twitterscraper-ruby 0.9.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 59b71fc6129f6d8c5a441981dc1577fa9b761380ff119bed4985cfcd88ccb31b
-   data.tar.gz: 2de3fcadc334ee2689d3083ea9324127c3b22ec94cf1b08dec920f9c95771445
+   metadata.gz: cf902c947e866cc99e79fbb9f8a51c829accd44aed03ef7657562bf41932c73d
+   data.tar.gz: 1bc5a0698a17b244ee9228d7728767dd00218179a5a49e0852a74cc722322ef0
  SHA512:
-   metadata.gz: b1e392bc021f6f758b79b7bdcd099af2ac391863f8712dadb5fd19248946867cfd89f140b836532fb40554c82697b26ef3af00b7cbb2cb13b0d5a8e2a38c87e7
-   data.tar.gz: 8c0e81589202e4a094c17604354f0f23a08b4536fe60b58ffe616cf1233c0531547ef02b8e88b6f70b1870ce2d134e4518ee093a5349144e2edfce3b1088e06c
+   metadata.gz: 629de8698af1391c210b496e9aadb51ad5f9d7157b1be5d0aa669ae821671e2b5624ba51083fb14b61f93618ff3e90aea1ac0eccb6ea00360fac48a2dfc436c7
+   data.tar.gz: 3f3706bee5f2a92a2addae034201e2e8cee3fef43efdc323be963cbaf1b94c31c53aa49a19e58a068498722dfe07e9796e097fb04364a9afda56d06132e6b935
data/.circleci/config.yml ADDED
@@ -0,0 +1,31 @@
+ version: 2.1
+ orbs:
+   ruby: circleci/ruby@0.1.2
+
+ jobs:
+   build:
+     docker:
+       - image: circleci/ruby:2.6.4-stretch-node
+         environment:
+           BUNDLER_VERSION: 2.1.4
+     executor: ruby/default
+     steps:
+       - checkout
+       - run:
+           name: Update bundler
+           command: gem update bundler
+       - run:
+           name: Which bundler?
+           command: bundle -v
+       - restore_cache:
+           keys:
+             - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+             - gem-cache-v1-{{ arch }}-{{ .Branch }}
+             - gem-cache-v1
+       - run: bundle install --path vendor/bundle
+       - run: bundle clean
+       - save_cache:
+           key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+           paths:
+             - vendor/bundle
+       - run: bundle exec rspec
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
  /pkg/
  /spec/reports/
  /tmp/
-
+ /cache
  /.idea
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ -fd
+ --require spec_helper
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec

  gem "rake", "~> 12.0"
  gem "minitest", "~> 5.0"
+ gem "rspec"
data/Gemfile.lock CHANGED
@@ -1,19 +1,33 @@
  PATH
    remote: .
    specs:
-     twitterscraper-ruby (0.9.0)
+     twitterscraper-ruby (0.14.0)
        nokogiri
        parallel

  GEM
    remote: https://rubygems.org/
    specs:
+     diff-lcs (1.4.4)
      mini_portile2 (2.4.0)
      minitest (5.14.1)
      nokogiri (1.10.10)
        mini_portile2 (~> 2.4.0)
      parallel (1.19.2)
      rake (12.3.3)
+     rspec (3.9.0)
+       rspec-core (~> 3.9.0)
+       rspec-expectations (~> 3.9.0)
+       rspec-mocks (~> 3.9.0)
+     rspec-core (3.9.2)
+       rspec-support (~> 3.9.3)
+     rspec-expectations (3.9.2)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-mocks (3.9.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-support (3.9.3)

  PLATFORMS
    ruby
@@ -21,6 +35,7 @@ PLATFORMS

  DEPENDENCIES
    minitest (~> 5.0)
    rake (~> 12.0)
+   rspec
    twitterscraper-ruby!

  BUNDLED WITH
data/README.md CHANGED
@@ -1,5 +1,6 @@
  # twitterscraper-ruby

+ [![Build Status](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
  [![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)

  A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
@@ -33,7 +34,7 @@ Command-line interface:

  ```shell script
  $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-     --limit 100 --threads 10 --proxy --output output.json
+     --limit 100 --threads 10 --output output.json
  ```

  From Within Ruby:
@@ -47,10 +48,9 @@ options = {
    lang: 'ja',
    limit: 100,
    threads: 10,
-   proxy: true
  }

- client = Twitterscraper::Client.new
+ client = Twitterscraper::Client.new(cache: true, proxy: true)
  tweets = client.query_tweets(KEYWORD, options)

  tweets.each do |tweet|
@@ -137,13 +137,17 @@ $ cat tweets.json | jq . | less
  | ------------- | ------------- | ------------- |
  | `-h`, `--help` | This option displays a summary of twitterscraper. | |
  | `--query` | Specify a keyword used during the search. | |
- | `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
- | `--end_date` | Set the enddate which twitterscraper-ruby should use to stop scraping for your query. | |
+ | `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+ | `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
  | `--lang` | Retrieve tweets written in a specific language. | |
  | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+ | `--order` | Sort order of the results. | desc |
  | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
- | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+ | `--proxy` | Scrape https://twitter.com/search via proxies. | true |
+ | `--cache` | Enable caching. | true |
+ | `--format` | The format of the output. | json |
  | `--output` | The name of the output file. | tweets.json |
+ | `--verbose` | Print debug messages. | |
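For orientation, here is how the new options line up when driving the gem from Ruby rather than the CLI. This is a sketch based on the code changes below; `--format` stays CLI-only (it is handled in cli.rb's `export`), and the query string here is arbitrary:

```ruby
require 'twitterscraper'

# cache/proxy now configure the client; the rest remain per-query options.
client = Twitterscraper::Client.new(cache: true, proxy: false)
tweets = client.query_tweets('ruby', start_date: '2020-06-01', end_date: '2020-06-30',
                             order: 'asc', limit: 100)
```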
  ## Contributing
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
    cli.parse
    cli.run
  rescue => e
-   STDERR.puts e.message
+   STDERR.puts e.inspect
    STDERR.puts e.backtrace.join("\n")
    exit 1
  end
data/lib/twitterscraper.rb CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
  require 'twitterscraper/proxy'
  require 'twitterscraper/http'
  require 'twitterscraper/lang'
+ require 'twitterscraper/cache'
  require 'twitterscraper/query'
  require 'twitterscraper/client'
  require 'twitterscraper/tweet'
+ require 'twitterscraper/template'
  require 'version'

  module Twitterscraper
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,69 @@
+ require 'base64'
+ require 'digest/md5'
+
+ module Twitterscraper
+   class Cache
+     def initialize()
+       @ttl = 3600 # 1 hour
+       @dir = 'cache'
+       Dir.mkdir(@dir) unless File.exist?(@dir)
+     end
+
+     def read(key)
+       key = cache_key(key)
+       file = File.join(@dir, key)
+       entry = Entry.from_json(File.read(file))
+       entry.value if entry.time > Time.now - @ttl
+     rescue Errno::ENOENT => e
+       nil
+     end
+
+     def write(key, value)
+       key = cache_key(key)
+       entry = Entry.new(key, value, Time.now)
+       file = File.join(@dir, key)
+       File.write(file, entry.to_json)
+     end
+
+     def fetch(key, &block)
+       if (value = read(key))
+         value
+       else
+         yield.tap { |v| write(key, v) }
+       end
+     end
+
+     def cache_key(key)
+       value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+       value = Digest::MD5.hexdigest(value) if value.length >= 100
+       value
+     end
+
+     class Entry < Hash
+       attr_reader :key, :value, :time
+
+       def initialize(key, value, time)
+         @key = key
+         @value = value
+         @time = time
+       end
+
+       def attrs
+         {key: @key, value: @value, time: @time}
+       end
+
+       def to_json
+         hash = attrs
+         hash[:value] = Base64.encode64(hash[:value])
+         hash.to_json
+       end
+
+       class << self
+         def from_json(text)
+           json = JSON.parse(text)
+           new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+         end
+       end
+     end
+   end
+ end
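The new `Cache` is a read-through file cache with a hard-coded 1-hour TTL. A minimal usage sketch, assuming a writable working directory; the key here is a hypothetical search URL, and `json`/`time` are required explicitly since `cache.rb` itself only requires `base64` and `digest/md5`:

```ruby
require 'json'
require 'time'
require 'twitterscraper'

cache = Twitterscraper::Cache.new
key = 'https://twitter.com/i/search/timeline?q=ruby' # hypothetical key

# The block runs only on a miss (or after the 1-hour TTL expires); its
# String result is Base64-encoded and persisted under ./cache.
body = cache.fetch(key) { '<html>fetched page</html>' }
puts body
```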
data/lib/twitterscraper/cli.rb CHANGED
@@ -20,12 +20,25 @@ module Twitterscraper
          end_date: options['end_date'],
          lang: options['lang'],
          limit: options['limit'],
+         daily_limit: options['daily_limit'],
+         order: options['order'],
          threads: options['threads'],
-         proxy: options['proxy']
        }
-       client = Twitterscraper::Client.new
+       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
        tweets = client.query_tweets(options['query'], query_options)
-       File.write(options['output'], generate_json(tweets)) unless tweets.empty?
+       export(tweets) unless tweets.empty?
+     end
+
+     def export(tweets)
+       write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+       if options['format'] == 'json'
+         write_json.call
+       elsif options['format'] == 'html'
+         File.write('tweets.html', Template.tweets_embedded_html(tweets))
+       else
+         write_json.call
+       end
      end

      def generate_json(tweets)
@@ -51,17 +64,28 @@ module Twitterscraper
        'end_date:',
        'lang:',
        'limit:',
+       'daily_limit:',
+       'order:',
        'threads:',
        'output:',
-       'proxy',
+       'format:',
+       'cache:',
+       'proxy:',
        'pretty',
        'verbose',
      )

+     options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
      options['lang'] ||= ''
      options['limit'] = (options['limit'] || 100).to_i
+     options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
      options['threads'] = (options['threads'] || 2).to_i
-     options['output'] ||= 'tweets.json'
+     options['format'] ||= 'json'
+     options['order'] ||= 'desc'
+     options['output'] ||= "tweets.#{options['format']}"
+
+     options['cache'] = options['cache'] != 'false'
+     options['proxy'] = options['proxy'] != 'false'

      options
    end
@@ -86,7 +110,7 @@ module Twitterscraper
    end

    def print_version
-     puts "twitterscraper-#{Twitterscraper::VERSION}"
+     puts "twitterscraper-#{VERSION}"
    end
  end
end
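Note the coercion at the end of option parsing: `--cache` and `--proxy` now default to on, and only the literal string `'false'` turns them off. The rule in isolation:

```ruby
# Any parsed value other than the string 'false' (including nil, i.e. the
# flag was omitted entirely) enables the feature.
['false', 'true', nil, '1'].each do |value|
  puts "#{value.inspect} -> #{value != 'false'}"
end
# "false" -> false, "true" -> true, nil -> true, "1" -> true
```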
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,18 @@
  module Twitterscraper
    class Client
      include Query
+
+     def initialize(cache: true, proxy: true)
+       @cache = cache
+       @proxy = proxy
+     end
+
+     def cache_enabled?
+       @cache
+     end
+
+     def proxy_enabled?
+       @proxy
+     end
    end
  end
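A quick sketch of the new constructor and its predicates; both flags default to true when omitted:

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: false)
client.cache_enabled? # => true
client.proxy_enabled? # => false
```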
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
          reload
        end
        @cur_index += 1
-       item = @items[@cur_index - 1]
-       Twitterscraper.logger.info("Using proxy #{item}")
-       item
+       @items[@cur_index - 1]
      end

      def size
        @items.size
      end

+     def empty?
+       @items.empty?
+     end
+
      private

      def reload
@@ -51,7 +53,6 @@ module Twitterscraper
          proxies << ip + ':' + port
        end

-       Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
        proxies.shuffle
      rescue => e
        if (retries -= 1) > 0
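`Pool#sample` remains a sequential rotation rather than a random pick (only the logging moves to the caller in query.rb). The rotation idiom in isolation, with placeholder addresses from the documentation range:

```ruby
items = ['203.0.113.1:80', '203.0.113.2:80']
cur_index = 0
3.times do
  cur_index = 0 if cur_index >= items.size # the real pool also reloads here
  item = items[cur_index]
  cur_index += 1
  puts item # 203.0.113.1:80, 203.0.113.2:80, 203.0.113.1:80
end
```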
data/lib/twitterscraper/query.rb CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper

    def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
      return nil if stop_requested?
-     Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+     unless proxies.empty?
+       proxy = proxies.sample
+       logger.info("Using proxy #{proxy}")
+     end
+     Http.get(url, headers, proxy, timeout)
    rescue => e
      logger.debug "query_single_page: #{e.inspect}"
      if (retries -= 1) > 0
-       logger.info("Retrying... (Attempts left: #{retries - 1})")
+       logger.info "Retrying... (Attempts left: #{retries - 1})"
        retry
      else
-       raise
+       raise Error.new("#{e.inspect} url=#{url}")
      end
    end
 
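The rescue clause keeps the decrement-and-retry idiom; a standalone sketch of how it plays out, with a simulated failure:

```ruby
retries = 3
begin
  puts "attempting request (retries left: #{retries})"
  raise 'simulated network failure'
rescue => e
  retry if (retries -= 1) > 0
  puts "giving up: #{e.message}" # reached after three attempts
end
```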
@@ -71,14 +75,27 @@ module Twitterscraper
    end

    def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-     logger.info("Querying #{query}")
+     logger.info "Querying #{query}"
      query = ERB::Util.url_encode(query)

      url = build_query_url(query, lang, pos, from_user)
-     logger.debug("Scraping tweets from #{url}")
+     http_request = lambda do
+       logger.debug "Scraping tweets from #{url}"
+       get_single_page(url, headers, proxies)
+     end

-     response = get_single_page(url, headers, proxies)
-     return [], nil if response.nil?
+     if cache_enabled?
+       client = Cache.new
+       if (response = client.read(url))
+         logger.debug 'Fetching tweets from cache'
+       else
+         response = http_request.call
+         client.write(url, response) unless stop_requested?
+       end
+     else
+       response = http_request.call
+     end
+     return [], nil if response.nil? || response.empty?

      html, json_resp = parse_single_page(response, pos.nil?)
 
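`query_single_page` uses `read`/`write` directly rather than `Cache#fetch` so that a response obtained after a stop request is never persisted. The control flow in isolation, with a stubbed cache and request (no real HTTP or files):

```ruby
cache = {}
stop_requested = false
url = 'https://example.com/search' # hypothetical
http_request = lambda { 'fresh response' }

if (response = cache[url])
  puts 'Fetching tweets from cache'
else
  response = http_request.call
  cache[url] = response unless stop_requested
end
puts response # => "fresh response"
```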
@@ -99,33 +116,34 @@ module Twitterscraper

    OLDEST_DATE = Date.parse('2006-03-21')

-   def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
+   def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
+     query = queries[0]
      if query.nil? || query == ''
-       raise 'Please specify a search query.'
+       raise Error.new('Please specify a search query.')
      end

      if ERB::Util.url_encode(query).length >= 500
-       raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+       raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
      end

      if start_date && end_date
        if start_date == end_date
-         raise 'Please specify different values for :start_date and :end_date.'
+         raise Error.new('Please specify different values for :start_date and :end_date.')
        elsif start_date > end_date
-         raise ':start_date must occur before :end_date.'
+         raise Error.new(':start_date must occur before :end_date.')
        end
      end

      if start_date
        if start_date < OLDEST_DATE
-         raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+         raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
        end
      end

      if end_date
        today = Date.today
        if end_date > Date.today
-         raise ":end_date must be less than or equal to today(#{today})"
+         raise Error.new(":end_date must be less than or equal to today(#{today})")
        end
      end
    end
@@ -143,27 +161,32 @@ module Twitterscraper
      end
    end

-   def main_loop(query, lang, limit, headers, proxies)
+   def main_loop(query, lang, limit, daily_limit, headers, proxies)
      pos = nil
+     daily_tweets = []

      while true
        new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
        unless new_tweets.empty?
+         daily_tweets.concat(new_tweets)
+         daily_tweets.uniq! { |t| t.tweet_id }
+
          @mutex.synchronize {
            @all_tweets.concat(new_tweets)
            @all_tweets.uniq! { |t| t.tweet_id }
          }
        end
-       logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+       logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

        break unless new_pos
+       break if daily_limit && daily_tweets.size >= daily_limit
        break if @all_tweets.size >= limit

        pos = new_pos
      end

-     if @all_tweets.size >= limit
-       logger.info("Limit reached #{@all_tweets.size}")
+     if !@stop_requested && @all_tweets.size >= limit
+       logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
        @stop_requested = true
      end
    end
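`main_loop` now tracks a per-query (effectively per-day) tally alongside the global one, deduplicating by `tweet_id` so paging overlap does not inflate the count toward `daily_limit`. The accumulation step in isolation, with `FakeTweet` as a stand-in:

```ruby
FakeTweet = Struct.new(:tweet_id)

daily_tweets = [FakeTweet.new(1)]
new_tweets = [FakeTweet.new(1), FakeTweet.new(2)] # page overlap repeats id 1
daily_tweets.concat(new_tweets)
daily_tweets.uniq! { |t| t.tweet_id }
puts daily_tweets.size # => 2
```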
@@ -172,37 +195,51 @@ module Twitterscraper
      @stop_requested
    end

-   def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+   def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
      queries = build_queries(query, start_date, end_date)
-     threads = queries.size if threads > queries.size
-     proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+     if threads > queries.size
+       logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+       threads = queries.size
+     end
+     if proxy_enabled?
+       proxies = Proxy::Pool.new
+       logger.debug "Fetch #{proxies.size} proxies"
+     else
+       proxies = []
+       logger.debug 'Proxy disabled'
+     end
+     logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

-     validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)

-     logger.info("The number of threads #{threads}")
+     validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+     logger.info "The number of threads #{threads}"

      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-     logger.info("Headers #{headers}")
+     logger.info "Headers #{headers}"

      @all_tweets = []
      @mutex = Mutex.new
      @stop_requested = false

      if threads > 1
+       Thread.abort_on_exception = true
+       logger.debug "Set 'Thread.abort_on_exception' to true"
+
        Parallel.each(queries, in_threads: threads) do |query|
-         main_loop(query, lang, limit, headers, proxies)
+         main_loop(query, lang, limit, daily_limit, headers, proxies)
          raise Parallel::Break if stop_requested?
        end
      else
        queries.each do |query|
-         main_loop(query, lang, limit, headers, proxies)
+         main_loop(query, lang, limit, daily_limit, headers, proxies)
          break if stop_requested?
        end
      end

-     @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+     @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
    end
  end
end
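The final sort folds the new `:order` option into the key by flipping its sign; in isolation:

```ruby
order = 'desc'
timestamps = [3, 1, 2] # stand-ins for tweet.created_at.to_i
p timestamps.sort_by { |t| (order == 'desc' ? -1 : 1) * t }
# => [3, 2, 1]; with order = 'asc' it would be [1, 2, 3]
```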
data/lib/twitterscraper/template.rb ADDED
@@ -0,0 +1,48 @@
+ module Twitterscraper
+   module Template
+     module_function
+
+     def tweets_embedded_html(tweets)
+       tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+       EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+     end
+
+     EMBED_TWEET_HTML = <<~'HTML'
+       <blockquote class="twitter-tweet">
+         <a href="__TWEET_URL__"></a>
+       </blockquote>
+     HTML
+
+     EMBED_TWEETS_HTML = <<~'HTML'
+       <html>
+         <head>
+           <style type=text/css>
+             .twitter-tweet {
+               margin: 30px auto 0 auto !important;
+             }
+           </style>
+           <script>
+             window.twttr = (function(d, s, id) {
+               var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+               if (d.getElementById(id)) return t;
+               js = d.createElement(s);
+               js.id = id;
+               js.src = "https://platform.twitter.com/widgets.js";
+               fjs.parentNode.insertBefore(js, fjs);
+
+               t._e = [];
+               t.ready = function(f) {
+                 t._e.push(f);
+               };
+
+               return t;
+             }(document, "script", "twitter-wjs"));
+           </script>
+         </head>
+         <body>
+           __TWEETS__
+         </body>
+       </html>
+     HTML
+   end
+ end
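`Template.tweets_embedded_html` only needs objects responding to `#tweet_url`, so a hedged usage sketch with a stand-in works; this mirrors what cli.rb's `export` does for `--format html`:

```ruby
require 'twitterscraper'

FakeTweet = Struct.new(:tweet_url)
tweets = [FakeTweet.new('https://twitter.com/jack/status/20')]
File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
```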
data/lib/twitterscraper/tweet.rb CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
    end

    class << self
+     def from_json(text)
+       json = JSON.parse(text)
+       json.map do |tweet|
+         tweet['created_at'] = Time.parse(tweet['created_at'])
+         new(tweet)
+       end
+     end
+
      def from_html(text)
        html = Nokogiri::HTML(text)
        from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -51,12 +59,19 @@ module Twitterscraper
      def from_tweets_html(html)
        html.map do |tweet|
          from_tweet_html(tweet)
-       end
+       end.compact
      end

      def from_tweet_html(html)
+       screen_name = html.attr('data-screen-name')
+       tweet_id = html.attr('data-tweet-id')&.to_i
+
+       unless html.to_s.include?('js-tweet-text-container')
+         Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+         return nil
+       end
+
        inner_html = Nokogiri::HTML(html.inner_html)
-       tweet_id = html.attr('data-tweet-id').to_i
        text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
        links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
        image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -81,7 +96,7 @@ module Twitterscraper

        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
        new(
-         screen_name: html.attr('data-screen-name'),
+         screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
          tweet_id: tweet_id,
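`Tweet.from_json` re-hydrates `created_at`, which `JSON.parse` leaves as a String. The coercion step on its own, with minimal stand-in data:

```ruby
require 'json'
require 'time'

text = '[{"tweet_id":20,"created_at":"2006-03-21 12:50:14 +0000"}]'
records = JSON.parse(text).map do |tweet|
  tweet['created_at'] = Time.parse(tweet['created_at'])
  tweet
end
puts records.first['created_at'].class # => Time
```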
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Twitterscraper
-   VERSION = '0.9.0'
+   VERSION = '0.14.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: twitterscraper-ruby
  version: !ruby/object:Gem::Version
-   version: 0.9.0
+   version: 0.14.0
  platform: ruby
  authors:
  - ts-3156
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-07-13 00:00:00.000000000 Z
+ date: 2020-07-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: nokogiri
@@ -46,8 +46,10 @@ executables:
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".circleci/config.yml"
  - ".gitignore"
  - ".irbrc"
+ - ".rspec"
  - ".ruby-version"
  - ".travis.yml"
  - CODE_OF_CONDUCT.md
@@ -61,6 +63,7 @@ files:
  - bin/twitterscraper
  - lib/twitterscraper-ruby.rb
  - lib/twitterscraper.rb
+ - lib/twitterscraper/cache.rb
  - lib/twitterscraper/cli.rb
  - lib/twitterscraper/client.rb
  - lib/twitterscraper/http.rb
@@ -68,6 +71,7 @@ files:
  - lib/twitterscraper/logger.rb
  - lib/twitterscraper/proxy.rb
  - lib/twitterscraper/query.rb
+ - lib/twitterscraper/template.rb
  - lib/twitterscraper/tweet.rb
  - lib/version.rb
  - twitterscraper-ruby.gemspec