twitterscraper-ruby 0.8.0 → 0.13.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5a0a2d55fac0a72e83d696c088daa6ca84b7b13519fbbe7a259dd1979373039a
-  data.tar.gz: a6cf2a0793f05d03d8d9b489eba985a244c7dce9f70e935d03207a7e103d6365
+  metadata.gz: bafdfd47b386ef7f717dc5846102c8a5153f4660e61d3559f6834cdca340c19c
+  data.tar.gz: fb5564629d89ae83c916d868e9fd401fdca1b423fbeb2d6945b0831c0d8ecf11
 SHA512:
-  metadata.gz: 3b4ca939b22a48fc53e1c1cb9ea25f55cdd6f8a53eb26fa1733948a8df44cd46fa51884668a70bbc31e85c4b986172d23995633557644b5ea93d7640b4034cf9
-  data.tar.gz: 9b1d61933990c916734fc6722bc12e6fdda513c4532edcb86982feabc30dabeaa13f39db03c8555fb8ddaa2aafc0493cb88069fbc374515737ed1465522f153b
+  metadata.gz: 5e9c819b318a908c73f56de0a638c7c819cc1e31c867812ec9ffa3e23362318db3dfc1fe5ffde53c4769bffdf1f62efdb4701bf9b1efe874625cdb6ce21ef1bc
+  data.tar.gz: aa75f3a328f6c2c278738962e7d6e9ea747841343362e8a0f226fd76b316b6ab05e63c93e495ae39009fd3f0a1c4eb0657bc66e1b22416e6a695cc34b4059643
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
 /pkg/
 /spec/reports/
 /tmp/
-
+/cache
 /.idea
data/.rspec ADDED
@@ -0,0 +1,2 @@
+-fd
+--require spec_helper
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec
 
 gem "rake", "~> 12.0"
 gem "minitest", "~> 5.0"
+gem "rspec"
data/Gemfile.lock CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.8.0)
+    twitterscraper-ruby (0.13.0)
       nokogiri
       parallel
 
 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
 
 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!
 
 BUNDLED WITH
data/README.md CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
 
 ```shell script
 $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --proxy --output output.json
+    --limit 100 --threads 10 --proxy --cache --output output.json
 ```
 
 From Within Ruby:
@@ -56,12 +56,60 @@ tweets = client.query_tweets(KEYWORD, options)
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
-  puts tweet.created_at
   puts tweet.tweet_url
+  puts tweet.created_at
+
+  hash = tweet.attrs
+  puts hash.keys
 end
 ```
 
 
+## Attributes
+
+### Tweet
+
+- screen_name
+- name
+- user_id
+- tweet_id
+- text
+- links
+- hashtags
+- image_urls
+- video_url
+- has_media
+- likes
+- retweets
+- replies
+- is_replied
+- is_reply_to
+- parent_tweet_id
+- reply_to_users
+- tweet_url
+- created_at
+
+
+## Search operators
+
+| Operator | Finds Tweets... |
+| ------------- | ------------- |
+| watching now | containing both "watching" and "now". This is the default operator. |
+| "happy hour" | containing the exact phrase "happy hour". |
+| love OR hate | containing either "love" or "hate" (or both). |
+| beer -root | containing "beer" but not "root". |
+| #haiku | containing the hashtag "haiku". |
+| from:interior | sent from Twitter account "interior". |
+| to:NASA | a Tweet authored in reply to Twitter account "NASA". |
+| @NASA | mentioning Twitter account "NASA". |
+| puppy filter:media | containing "puppy" and an image or video. |
+| puppy -filter:retweets | containing "puppy", filtering out retweets. |
+| superhero since:2015-12-21 | containing "superhero" and sent since date "2015-12-21" (year-month-day). |
+| puppy until:2015-12-21 | containing "puppy" and sent before the date "2015-12-21". |
+
+Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
+
+
 ## Examples
 
 ```shell script
@@ -79,40 +127,26 @@ $ cat tweets.json | jq . | less
     "tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
     "created_at": "2020-07-13 12:00:00 +0000",
     "text": "Thanks Twitter!"
-  },
-  ...
+  }
 ]
 ```
 
-## Attributes
-
-### Tweet
-
-- tweet_id
-- text
-- user_id
-- screen_name
-- name
-- links
-- hashtags
-- image_urls
-- tweet_url
-- created_at
-
-
 ## CLI Options
 
 | Option | Description | Default |
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
-| `--end_date` | Set the enddate which twitterscraper-ruby should use to stop scraping for your query. | |
+| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
 | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+| `--cache` | Enable caching. | false |
+| `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | false |
 
 
 ## Contributing
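
Taken together, the options added in this release can be combined in one run. The following is a hypothetical invocation (keyword and dates are placeholders), based on the option parsing added in `cli.rb` below, where `--cache` and `--proxy` take an explicit `true`/`false` argument:

```shell script
$ twitterscraper --query ruby --start_date 2020-07-01 --end_date 2020-07-10 \
    --limit 500 --daily_limit 50 --threads 5 --proxy false --cache true --format html
```

With `--format html`, the result is written via `Template.tweets_embedded_html` instead of JSON.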
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
   cli.parse
   cli.run
 rescue => e
-  STDERR.puts e.message
+  STDERR.puts e.inspect
   STDERR.puts e.backtrace.join("\n")
   exit 1
 end
data/lib/twitterscraper.rb CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
+require 'twitterscraper/template'
 require 'version'
 
 module Twitterscraper
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
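
A minimal sketch of how the new `Twitterscraper::Cache` behaves (not part of the diff; the URL and response body are placeholders). Entries are written under `./cache` and expire after the hard-coded 3600-second TTL:

```ruby
require 'json'
require 'time'
require 'twitterscraper'

cache = Twitterscraper::Cache.new

# fetch returns the cached value when present and still fresh;
# otherwise it runs the block and stores the result on disk.
html = cache.fetch('https://twitter.com/search?q=ruby') do
  '<html>...</html>' # stands in for a real HTTP response body
end
puts html
```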
data/lib/twitterscraper/cli.rb CHANGED
@@ -20,12 +20,24 @@ module Twitterscraper
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
         threads: options['threads'],
-        proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-      File.write(options['output'], generate_json(tweets))
+      export(tweets) unless tweets.empty?
+    end
+
+    def export(tweets)
+      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+      if options['format'] == 'json'
+        write_json.call
+      elsif options['format'] == 'html'
+        File.write('tweets.html', Template.tweets_embedded_html(tweets))
+      else
+        write_json.call
+      end
     end
 
     def generate_json(tweets)
@@ -51,17 +63,26 @@ module Twitterscraper
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
         'threads:',
         'output:',
-        'proxy',
+        'format:',
+        'cache:',
+        'proxy:',
         'pretty',
         'verbose',
       )
 
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
-      options['output'] ||= 'tweets.json'
+      options['format'] ||= 'json'
+      options['output'] ||= "tweets.#{options['format']}"
+
+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
 
       options
     end
@@ -86,7 +107,7 @@ module Twitterscraper
     end
 
     def print_version
-      puts "twitterscraper-#{Twitterscraper::VERSION}"
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,18 @@
 module Twitterscraper
   class Client
     include Query
+
+    def initialize(cache: true, proxy: true)
+      @cache = cache
+      @proxy = proxy
+    end
+
+    def cache_enabled?
+      @cache
+    end
+
+    def proxy_enabled?
+      @proxy
+    end
   end
 end
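
The new constructor flags replace the old `proxy:` keyword on `query_tweets`. A hedged usage sketch (keyword and dates are placeholders; both flags default to `true`):

```ruby
require 'twitterscraper'

# Disable both the page cache and the proxy pool for this client.
client = Twitterscraper::Client.new(cache: false, proxy: false)

tweets = client.query_tweets('ruby', start_date: '2020-07-01', end_date: '2020-07-02')
puts tweets.size
```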
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-      item = @items[@cur_index - 1]
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end
 
     def size
       @items.size
     end
 
+    def empty?
+      @items.empty?
+    end
+
     private
 
     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end
 
-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
       proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
data/lib/twitterscraper/query.rb CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-      Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info("Retrying... (Attempts left: #{retries - 1})")
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
@@ -71,14 +75,27 @@ module Twitterscraper
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info("Querying #{query}")
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
-      logger.debug("Scraping tweets from #{url}")
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
 
-      response = get_single_page(url, headers, proxies)
-      return [], nil if response.nil?
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -97,35 +114,36 @@ module Twitterscraper
       end
     end
 
-    OLDEST_DATE = Date.parse('2006-3-21')
+    OLDEST_DATE = Date.parse('2006-03-21')
 
-    def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
+    def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
        end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
        end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
        end
      end
     end
@@ -143,27 +161,32 @@ module Twitterscraper
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.info("Limit reached #{@all_tweets.size}")
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -172,32 +195,46 @@ module Twitterscraper
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-      threads = queries.size if threads > queries.size
-      proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-      logger.info("The number of threads #{threads}")
+      validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info("Headers #{headers}")
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end
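
A sketch of the new `:daily_limit` option (not from the diff; values are placeholders). `build_queries` splits the date range into one query per day; `:daily_limit` caps how many tweets each per-day query collects, while `:limit` still caps the overall total:

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: false)
tweets = client.query_tweets('ruby',
                             start_date: '2020-07-01',
                             end_date: '2020-07-03',
                             limit: 100,      # stop once 100 tweets are collected overall
                             daily_limit: 10, # move on after 10 tweets for a given day
                             threads: 2)
puts tweets.size
```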
data/lib/twitterscraper/template.rb ADDED
@@ -0,0 +1,48 @@
+module Twitterscraper
+  module Template
+    module_function
+
+    def tweets_embedded_html(tweets)
+      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+    end
+
+    EMBED_TWEET_HTML = <<~'HTML'
+      <blockquote class="twitter-tweet">
+        <a href="__TWEET_URL__"></a>
+      </blockquote>
+    HTML
+
+    EMBED_TWEETS_HTML = <<~'HTML'
+      <html>
+        <head>
+          <style type=text/css>
+            .twitter-tweet {
+              margin: 30px auto 0 auto !important;
+            }
+          </style>
+          <script>
+            window.twttr = (function(d, s, id) {
+              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+              if (d.getElementById(id)) return t;
+              js = d.createElement(s);
+              js.id = id;
+              js.src = "https://platform.twitter.com/widgets.js";
+              fjs.parentNode.insertBefore(js, fjs);
+
+              t._e = [];
+              t.ready = function(f) {
+                t._e.push(f);
+              };
+
+              return t;
+            }(document, "script", "twitter-wjs"));
+          </script>
+        </head>
+        <body>
+          __TWEETS__
+        </body>
+      </html>
+    HTML
+  end
+end
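
A short sketch of the HTML export path (not part of the diff; query and dates are placeholders). `Template.tweets_embedded_html` wraps each tweet URL in an embedded-tweet blockquote and returns a complete page as a string:

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: false)
tweets = client.query_tweets('ruby', start_date: '2020-07-01', end_date: '2020-07-02')

File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
```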
data/lib/twitterscraper/tweet.rb CHANGED
@@ -21,6 +21,7 @@ module Twitterscraper
       :parent_tweet_id,
       :reply_to_users,
       :tweet_url,
+      :timestamp,
       :created_at,
     ]
     attr_reader *KEYS
@@ -31,13 +32,25 @@ module Twitterscraper
       end
     end
 
-    def to_json(options = {})
+    def attrs
       KEYS.map do |key|
         [key, send(key)]
-      end.to_h.to_json
+      end.to_h
+    end
+
+    def to_json(options = {})
+      attrs.to_json
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -46,12 +59,19 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end
 
       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -74,9 +94,9 @@ module Twitterscraper
           reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
         end
 
-        timestamp = inner_html.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]").first.attr('data-time').to_i
+        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-          screen_name: html.attr('data-screen-name'),
+          screen_name: screen_name,
           name: html.attr('data-name'),
           user_id: html.attr('data-user-id').to_i,
           tweet_id: tweet_id,
@@ -94,6 +114,7 @@ module Twitterscraper
           parent_tweet_id: parent_tweet_id,
           reply_to_users: reply_to_users,
           tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
+          timestamp: timestamp,
           created_at: Time.at(timestamp, in: '+00:00'),
         )
       end
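
A hedged round-trip sketch of the new `attrs`/`from_json` pair (not from the diff; it assumes `Tweet.new` accepts the string-keyed hashes that `JSON.parse` produces, as `from_json` itself does):

```ruby
require 'json'
require 'time'
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: false)
tweets = client.query_tweets('ruby', start_date: '2020-07-01', end_date: '2020-07-02')

json = tweets.to_json                            # each Tweet serializes via #attrs
restored = Twitterscraper::Tweet.from_json(json) # parses created_at back into a Time
puts restored.first.attrs.keys
```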
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.8.0'
+  VERSION = '0.13.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.13.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-13 00:00:00.000000000 Z
+date: 2020-07-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -48,6 +48,7 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
@@ -61,6 +62,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb
@@ -68,6 +70,7 @@ files:
 - lib/twitterscraper/logger.rb
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
+- lib/twitterscraper/template.rb
 - lib/twitterscraper/tweet.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec