twitterscraper-ruby 0.7.0 → 0.12.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 9cfd03782734642da8ac29839788f142399d2a3f4ec601e8b6f47ae1ca38c17f
-   data.tar.gz: 07a398e51fd2fbdc735ae27008d9a23e97dc390632179738045db4c81bd4fcad
+   metadata.gz: e98afb0444b724e0c9c29f6b888c017166859d1252337f34686526060ca8368d
+   data.tar.gz: 5ef3ff7f86d9a0c9dd1883d55498049d0f164aa7c71a7c9c2bbf0a89ae9bb32c
  SHA512:
-   metadata.gz: 6f417fe3379a3d9d134c308a9ea9d4e01b458018c9c5a3f8508a85e7f5890d01991838cfcabe87b8246f69edf4458c66d17924359798017907862071353f643d
-   data.tar.gz: 758bcb55ded936c3696f99647f64bc9921386b3cb0c783c218510c0e36991ae6b95a9d08fa071e02072c8b727bbadb6674ceeb19a74e356a842d62c1ec4c038f
+   metadata.gz: 04ef61c57545cbbdbbe5da53d1f24cf064b7d1c61ad3da9bc57a361d24ed24480c4f68fa1fea67345ceff4d4d4685f046a4586f55ebe8f3dc0ca6332c7c2d928
+   data.tar.gz: f5fd19c8289c7caf574dc78f754ba9aaf9446f3819b394d14414909b1505e0f9b25181802448d28285de8db81a27e12e0e65d1e1a0b2b0e5df8e7e73d6263e14
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
  /pkg/
  /spec/reports/
  /tmp/
-
+ /cache
  /.idea
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     twitterscraper-ruby (0.7.0)
+     twitterscraper-ruby (0.12.0)
        nokogiri
        parallel
 
data/README.md CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
 
  ```shell script
  $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-     --limit 100 --threads 10 --proxy --output output.json
+     --limit 100 --threads 10 --proxy --cache --output output.json
  ```
 
  From Within Ruby:
@@ -56,12 +56,60 @@ tweets = client.query_tweets(KEYWORD, options)
  tweets = client.query_tweets(KEYWORD, options)
  tweets.each do |tweet|
    puts tweet.tweet_id
    puts tweet.text
-   puts tweet.created_at
    puts tweet.tweet_url
+   puts tweet.created_at
+
+   hash = tweet.attrs
+   puts hash.keys
  end
  ```
 
 
+ ## Attributes
+
+ ### Tweet
+
+ - screen_name
+ - name
+ - user_id
+ - tweet_id
+ - text
+ - links
+ - hashtags
+ - image_urls
+ - video_url
+ - has_media
+ - likes
+ - retweets
+ - replies
+ - is_replied
+ - is_reply_to
+ - parent_tweet_id
+ - reply_to_users
+ - tweet_url
+ - created_at
+
+
+ ## Search operators
+
+ | Operator | Finds Tweets... |
+ | ------------- | ------------- |
+ | watching now | containing both "watching" and "now". This is the default operator. |
+ | "happy hour" | containing the exact phrase "happy hour". |
+ | love OR hate | containing either "love" or "hate" (or both). |
+ | beer -root | containing "beer" but not "root". |
+ | #haiku | containing the hashtag "haiku". |
+ | from:interior | sent from Twitter account "interior". |
+ | to:NASA | a Tweet authored in reply to Twitter account "NASA". |
+ | @NASA | mentioning Twitter account "NASA". |
+ | puppy filter:media | containing "puppy" and an image or video. |
+ | puppy -filter:retweets | containing "puppy", filtering out retweets. |
+ | superhero since:2015-12-21 | containing "superhero" and sent since date "2015-12-21" (year-month-day). |
+ | puppy until:2015-12-21 | containing "puppy" and sent before the date "2015-12-21". |
+
+ Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
+
+
  ## Examples
 
  ```shell script
@@ -79,37 +127,26 @@ $ cat tweets.json | jq . | less
    "tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
    "created_at": "2020-07-13 12:00:00 +0000",
    "text": "Thanks Twitter!"
-   },
-   ...
+   }
  ]
  ```
 
- ## Attributes
-
- ### Tweet
-
- - tweet_id
- - text
- - user_id
- - screen_name
- - name
- - tweet_url
- - created_at
-
-
  ## CLI Options
 
  | Option | Description | Default |
  | ------------- | ------------- | ------------- |
  | `-h`, `--help` | This option displays a summary of twitterscraper. | |
  | `--query` | Specify a keyword used during the search. | |
- | `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
- | `--end_date` | Set the end date which twitterscraper-ruby should use to stop scraping for your query. | |
+ | `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+ | `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
  | `--lang` | Retrieve tweets written in a specific language. | |
  | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
  | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
  | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+ | `--cache` | Enable caching. | false |
+ | `--format` | The format of the output. | json |
  | `--output` | The name of the output file. | tweets.json |
+ | `--verbose` | Print debug messages. | false |
 
 
  ## Contributing
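
The new `Tweet#attrs` accessor and the expanded attribute list documented above can be exercised like this; a minimal sketch, assuming the gem is installed and `'ruby'` stands in for a real query (output values are illustrative):

```ruby
require 'twitterscraper-ruby'

client = Twitterscraper::Client.new(cache: true)
tweets = client.query_tweets('ruby', start_date: '2020-06-01', end_date: '2020-06-30', limit: 10)

tweet = tweets.first
hash = tweet.attrs      # one key per entry in the Attributes list above
puts hash[:tweet_url]   # e.g. "https://twitter.com/..."
puts hash[:hashtags]    # e.g. ["ruby"]
```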
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
    cli.parse
    cli.run
  rescue => e
-   STDERR.puts e.message
+   STDERR.puts e.inspect
    STDERR.puts e.backtrace.join("\n")
    exit 1
  end
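
The switch from `e.message` to `e.inspect` makes the exception class visible in the error output; a small sketch of the difference, assuming `Twitterscraper::Error` is the class raised by the validations in query.rb below:

```ruby
begin
  raise Twitterscraper::Error.new('Please specify a search query.')
rescue => e
  puts e.message  # "Please specify a search query."
  puts e.inspect  # "#<Twitterscraper::Error: Please specify a search query.>"
end
```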
data/lib/twitterscraper.rb CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
  require 'twitterscraper/proxy'
  require 'twitterscraper/http'
  require 'twitterscraper/lang'
+ require 'twitterscraper/cache'
  require 'twitterscraper/query'
  require 'twitterscraper/client'
  require 'twitterscraper/tweet'
+ require 'twitterscraper/template'
  require 'version'
 
  module Twitterscraper
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,69 @@
+ require 'base64'
+ require 'digest/md5'
+
+ module Twitterscraper
+   class Cache
+     def initialize()
+       @ttl = 3600 # 1 hour
+       @dir = 'cache'
+       Dir.mkdir(@dir) unless File.exist?(@dir)
+     end
+
+     def read(key)
+       key = cache_key(key)
+       file = File.join(@dir, key)
+       entry = Entry.from_json(File.read(file))
+       entry.value if entry.time > Time.now - @ttl
+     rescue Errno::ENOENT => e
+       nil
+     end
+
+     def write(key, value)
+       key = cache_key(key)
+       entry = Entry.new(key, value, Time.now)
+       file = File.join(@dir, key)
+       File.write(file, entry.to_json)
+     end
+
+     def fetch(key, &block)
+       if (value = read(key))
+         value
+       else
+         yield.tap { |v| write(key, v) }
+       end
+     end
+
+     def cache_key(key)
+       value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+       value = Digest::MD5.hexdigest(value) if value.length >= 100
+       value
+     end
+
+     class Entry < Hash
+       attr_reader :key, :value, :time
+
+       def initialize(key, value, time)
+         @key = key
+         @value = value
+         @time = time
+       end
+
+       def attrs
+         {key: @key, value: @value, time: @time}
+       end
+
+       def to_json
+         hash = attrs
+         hash[:value] = Base64.encode64(hash[:value])
+         hash.to_json
+       end
+
+       class << self
+         def from_json(text)
+           json = JSON.parse(text)
+           new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+         end
+       end
+     end
+   end
+ end
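
A minimal usage sketch for the new `Cache` class, based only on the methods above. Entries are stored as JSON files under `./cache` (keys are percent-escaped, or MD5-hashed once they reach 100 characters) and expire after the hard-coded one-hour TTL; the URL below is a hypothetical key:

```ruby
require 'twitterscraper-ruby'

cache = Twitterscraper::Cache.new

key = 'https://twitter.com/search?q=ruby'    # hypothetical cache key
cache.write(key, '<html>cached body</html>') # persists cache/<escaped key>
cache.read(key)  # => the body while fresh; nil once older than @ttl

# fetch runs the block only on a miss and caches its result
body = cache.fetch(key) { '<html>fetched body</html>' }
```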
data/lib/twitterscraper/cli.rb CHANGED
@@ -20,12 +20,25 @@ module Twitterscraper
        end_date: options['end_date'],
        lang: options['lang'],
        limit: options['limit'],
+       daily_limit: options['daily_limit'],
        threads: options['threads'],
        proxy: options['proxy']
      }
-     client = Twitterscraper::Client.new
+     client = Twitterscraper::Client.new(cache: options['cache'])
      tweets = client.query_tweets(options['query'], query_options)
-     File.write(options['output'], generate_json(tweets))
+     export(tweets) unless tweets.empty?
+   end
+
+   def export(tweets)
+     write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+     if options['format'] == 'json'
+       write_json.call
+     elsif options['format'] == 'html'
+       File.write('tweets.html', Template.tweets_embedded_html(tweets))
+     else
+       write_json.call
+     end
    end
 
    def generate_json(tweets)
@@ -51,17 +64,23 @@ module Twitterscraper
      'end_date:',
      'lang:',
      'limit:',
+     'daily_limit:',
      'threads:',
      'output:',
+     'format:',
+     'cache',
      'proxy',
      'pretty',
      'verbose',
    )
 
+   options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
    options['lang'] ||= ''
    options['limit'] = (options['limit'] || 100).to_i
+   options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
    options['threads'] = (options['threads'] || 2).to_i
-   options['output'] ||= 'tweets.json'
+   options['format'] ||= 'json'
+   options['output'] ||= "tweets.#{options['format']}"
 
    options
  end
@@ -86,7 +105,7 @@ module Twitterscraper
    end
 
    def print_version
-     puts "twitterscraper-#{Twitterscraper::VERSION}"
+     puts "twitterscraper-#{VERSION}"
    end
  end
  end
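
To make the new option handling concrete, a sketch of what the parsing code above does with the `format`/`output` defaults and the `oldest` shortcut (values are illustrative):

```ruby
# Illustrative re-run of the defaulting logic above:
options = { 'format' => 'html', 'start_date' => 'oldest' }

options['start_date'] = Twitterscraper::Query::OLDEST_DATE if options['start_date'] == 'oldest'
# => Date.parse('2006-03-21'), the oldest searchable date

options['format'] ||= 'json'
options['output'] ||= "tweets.#{options['format']}"  # => "tweets.html"
```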
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,13 @@
  module Twitterscraper
    class Client
      include Query
+
+     def initialize(cache: false)
+       @cache = cache
+     end
+
+     def cache_enabled?
+       @cache
+     end
    end
  end
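
A quick sketch of the new constructor flag; `cache_enabled?` is what `query_single_page` (further down in query.rb) consults before touching the cache:

```ruby
client = Twitterscraper::Client.new(cache: true)
client.cache_enabled?  # => true
```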
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
        reload
      end
      @cur_index += 1
-     item = @items[@cur_index - 1]
-     Twitterscraper.logger.info("Using proxy #{item}")
-     item
+     @items[@cur_index - 1]
    end
 
    def size
      @items.size
    end
 
+   def empty?
+     @items.empty?
+   end
+
    private
 
    def reload
@@ -51,7 +53,6 @@ module Twitterscraper
        proxies << ip + ':' + port
      end
 
-     Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
      proxies.shuffle
    rescue => e
      if (retries -= 1) > 0
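
Usage sketch for the pool changes, assuming the rotation method shown in the first hunk is the one query.rb invokes as `proxies.sample` (its logging moved out into the caller):

```ruby
pool = Twitterscraper::Proxy::Pool.new  # fetches and shuffles a proxy list
pool.size    # number of proxies loaded
pool.empty?  # new guard used by get_single_page before picking a proxy
pool.sample  # rotates through the list, reloading when exhausted
```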
data/lib/twitterscraper/query.rb CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
    def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
      return nil if stop_requested?
-     Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+     unless proxies.empty?
+       proxy = proxies.sample
+       logger.info("Using proxy #{proxy}")
+     end
+     Http.get(url, headers, proxy, timeout)
    rescue => e
      logger.debug "query_single_page: #{e.inspect}"
      if (retries -= 1) > 0
-       logger.info("Retrying... (Attempts left: #{retries - 1})")
+       logger.info "Retrying... (Attempts left: #{retries - 1})"
        retry
      else
-       raise
+       raise Error.new("#{e.inspect} url=#{url}")
      end
    end
 
@@ -71,14 +75,27 @@ module Twitterscraper
    end
 
    def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-     logger.info("Querying #{query}")
+     logger.info "Querying #{query}"
      query = ERB::Util.url_encode(query)
 
      url = build_query_url(query, lang, pos, from_user)
-     logger.debug("Scraping tweets from #{url}")
+     http_request = lambda do
+       logger.debug "Scraping tweets from #{url}"
+       get_single_page(url, headers, proxies)
+     end
 
-     response = get_single_page(url, headers, proxies)
-     return [], nil if response.nil?
+     if cache_enabled?
+       client = Cache.new
+       if (response = client.read(url))
+         logger.debug 'Fetching tweets from cache'
+       else
+         response = http_request.call
+         client.write(url, response) unless stop_requested?
+       end
+     else
+       response = http_request.call
+     end
+     return [], nil if response.nil? || response.empty?
 
      html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -97,35 +114,35 @@ module Twitterscraper
      end
    end
 
-   OLDEST_DATE = Date.parse('2006-3-21')
+   OLDEST_DATE = Date.parse('2006-03-21')
 
    def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
      if query.nil? || query == ''
-       raise 'Please specify a search query.'
+       raise Error.new('Please specify a search query.')
      end
 
      if ERB::Util.url_encode(query).length >= 500
-       raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+       raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
      end
 
      if start_date && end_date
        if start_date == end_date
-         raise 'Please specify different values for :start_date and :end_date.'
+         raise Error.new('Please specify different values for :start_date and :end_date.')
        elsif start_date > end_date
-         raise ':start_date must occur before :end_date.'
+         raise Error.new(':start_date must occur before :end_date.')
        end
      end
 
      if start_date
        if start_date < OLDEST_DATE
-         raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+         raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
        end
      end
 
      if end_date
        today = Date.today
        if end_date > Date.today
-         raise ":end_date must be less than or equal to today(#{today})"
+         raise Error.new(":end_date must be less than or equal to today(#{today})")
        end
      end
    end
@@ -143,27 +160,32 @@ module Twitterscraper
      end
    end
 
-   def main_loop(query, lang, limit, headers, proxies)
+   def main_loop(query, lang, limit, daily_limit, headers, proxies)
      pos = nil
+     daily_tweets = []
 
      while true
        new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
        unless new_tweets.empty?
+         daily_tweets.concat(new_tweets)
+         daily_tweets.uniq! { |t| t.tweet_id }
+
          @mutex.synchronize {
            @all_tweets.concat(new_tweets)
            @all_tweets.uniq! { |t| t.tweet_id }
          }
        end
-       logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+       logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
        break unless new_pos
+       break if daily_limit && daily_tweets.size >= daily_limit
        break if @all_tweets.size >= limit
 
        pos = new_pos
      end
 
-     if @all_tweets.size >= limit
-       logger.info("Limit reached #{@all_tweets.size}")
+     if !@stop_requested && @all_tweets.size >= limit
+       logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
        @stop_requested = true
      end
    end
@@ -172,32 +194,36 @@ module Twitterscraper
      @stop_requested
    end
 
-   def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+   def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
      queries = build_queries(query, start_date, end_date)
      threads = queries.size if threads > queries.size
-     proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+     proxies = proxy ? Proxy::Pool.new : []
 
      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-     logger.info("The number of threads #{threads}")
+     logger.debug "Fetch #{proxies.size} proxies" if proxy
+     logger.info "The number of threads #{threads}"
 
      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-     logger.info("Headers #{headers}")
+     logger.info "Headers #{headers}"
 
      @all_tweets = []
      @mutex = Mutex.new
      @stop_requested = false
 
      if threads > 1
+       Thread.abort_on_exception = true
+       logger.debug "Set 'Thread.abort_on_exception' to true"
+
        Parallel.each(queries, in_threads: threads) do |query|
-         main_loop(query, lang, limit, headers, proxies)
+         main_loop(query, lang, limit, daily_limit, headers, proxies)
          raise Parallel::Break if stop_requested?
        end
      else
        queries.each do |query|
-         main_loop(query, lang, limit, headers, proxies)
+         main_loop(query, lang, limit, daily_limit, headers, proxies)
          break if stop_requested?
        end
      end
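
The extended `query_tweets` signature can be called like this; a sketch with illustrative values, where `daily_limit` caps each day-slice of the date range (each slice runs through `main_loop`) while `limit` caps the overall total:

```ruby
client = Twitterscraper::Client.new
tweets = client.query_tweets(
  'ruby',
  start_date: '2020-06-01',
  end_date: '2020-06-30',
  lang: 'ja',
  limit: 100,       # overall cap, shared across threads
  daily_limit: 10,  # new: per-day cap
  threads: 10,
  proxy: false
)
```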
data/lib/twitterscraper/template.rb ADDED
@@ -0,0 +1,48 @@
+ module Twitterscraper
+   module Template
+     module_function
+
+     def tweets_embedded_html(tweets)
+       tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+       EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+     end
+
+     EMBED_TWEET_HTML = <<~'HTML'
+       <blockquote class="twitter-tweet">
+         <a href="__TWEET_URL__"></a>
+       </blockquote>
+     HTML
+
+     EMBED_TWEETS_HTML = <<~'HTML'
+       <html>
+       <head>
+         <style type=text/css>
+           .twitter-tweet {
+             margin: 30px auto 0 auto !important;
+           }
+         </style>
+         <script>
+           window.twttr = (function(d, s, id) {
+             var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+             if (d.getElementById(id)) return t;
+             js = d.createElement(s);
+             js.id = id;
+             js.src = "https://platform.twitter.com/widgets.js";
+             fjs.parentNode.insertBefore(js, fjs);
+
+             t._e = [];
+             t.ready = function(f) {
+               t._e.push(f);
+             };
+
+             return t;
+           }(document, "script", "twitter-wjs"));
+         </script>
+       </head>
+       <body>
+         __TWEETS__
+       </body>
+       </html>
+     HTML
+   end
+ end
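
Usage sketch for the new Template module (this is what the CLI's `--format html` branch writes), given `tweets` returned by `query_tweets`:

```ruby
html = Twitterscraper::Template.tweets_embedded_html(tweets)
File.write('tweets.html', html)  # blockquotes hydrated by widgets.js when opened
```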
data/lib/twitterscraper/tweet.rb CHANGED
@@ -2,7 +2,28 @@ require 'time'
 
  module Twitterscraper
    class Tweet
-     KEYS = [:screen_name, :name, :user_id, :tweet_id, :tweet_url, :created_at, :text]
+     KEYS = [
+       :screen_name,
+       :name,
+       :user_id,
+       :tweet_id,
+       :text,
+       :links,
+       :hashtags,
+       :image_urls,
+       :video_url,
+       :has_media,
+       :likes,
+       :retweets,
+       :replies,
+       :is_replied,
+       :is_reply_to,
+       :parent_tweet_id,
+       :reply_to_users,
+       :tweet_url,
+       :timestamp,
+       :created_at,
+     ]
      attr_reader *KEYS
 
      def initialize(attrs)
@@ -11,13 +32,25 @@ module Twitterscraper
      end
    end
 
-   def to_json(options = {})
+   def attrs
      KEYS.map do |key|
        [key, send(key)]
-     end.to_h.to_json
+     end.to_h
+   end
+
+   def to_json(options = {})
+     attrs.to_json
    end
 
    class << self
+     def from_json(text)
+       json = JSON.parse(text)
+       json.map do |tweet|
+         tweet['created_at'] = Time.parse(tweet['created_at'])
+         new(tweet)
+       end
+     end
+
      def from_html(text)
        html = Nokogiri::HTML(text)
        from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -26,20 +59,63 @@ module Twitterscraper
      def from_tweets_html(html)
        html.map do |tweet|
          from_tweet_html(tweet)
-       end
+       end.compact
      end
 
      def from_tweet_html(html)
+       screen_name = html.attr('data-screen-name')
+       tweet_id = html.attr('data-tweet-id')&.to_i
+
+       unless html.to_s.include?('js-tweet-text-container')
+         Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+         return nil
+       end
+
        inner_html = Nokogiri::HTML(html.inner_html)
+       text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
+       links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
+       image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
+       video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
+       has_media = !image_urls.empty? || (video_url && !video_url.empty?)
+
+       actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
+       likes = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
+       retweets = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--retweet')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
+       replies = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--reply u-hiddenVisually')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
+       is_replied = replies != 0
+
+       parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
+       if tweet_id == parent_tweet_id
+         is_reply_to = false
+         parent_tweet_id = nil
+         reply_to_users = []
+       else
+         is_reply_to = true
+         reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
+       end
+
        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
        new(
-         screen_name: html.attr('data-screen-name'),
+         screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
-         tweet_id: html.attr('data-tweet-id').to_i,
+         tweet_id: tweet_id,
+         text: text,
+         links: links,
+         hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
+         image_urls: image_urls,
+         video_url: video_url,
+         has_media: has_media,
+         likes: likes,
+         retweets: retweets,
+         replies: replies,
+         is_replied: is_replied,
+         is_reply_to: is_reply_to,
+         parent_tweet_id: parent_tweet_id,
+         reply_to_users: reply_to_users,
          tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
+         timestamp: timestamp,
          created_at: Time.at(timestamp, in: '+00:00'),
-         text: inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text,
        )
      end
    end
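
The new `attrs`/`from_json` pair makes tweets round-trippable through JSON; a sketch, assuming `tweets` is an array of `Tweet` objects:

```ruby
json = tweets.to_json                            # serialized via Tweet#attrs
restored = Twitterscraper::Tweet.from_json(json) # back to Tweet objects
restored.first.created_at                        # Time, re-parsed by from_json
```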
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Twitterscraper
-   VERSION = '0.7.0'
+   VERSION = '0.12.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: twitterscraper-ruby
  version: !ruby/object:Gem::Version
-   version: 0.7.0
+   version: 0.12.0
  platform: ruby
  authors:
  - ts-3156
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-07-13 00:00:00.000000000 Z
+ date: 2020-07-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: nokogiri
@@ -61,6 +61,7 @@ files:
  - bin/twitterscraper
  - lib/twitterscraper-ruby.rb
  - lib/twitterscraper.rb
+ - lib/twitterscraper/cache.rb
  - lib/twitterscraper/cli.rb
  - lib/twitterscraper/client.rb
  - lib/twitterscraper/http.rb
@@ -68,6 +69,7 @@ files:
  - lib/twitterscraper/logger.rb
  - lib/twitterscraper/proxy.rb
  - lib/twitterscraper/query.rb
+ - lib/twitterscraper/template.rb
  - lib/twitterscraper/tweet.rb
  - lib/version.rb
  - twitterscraper-ruby.gemspec