twitterscraper-ruby 0.9.0 → 0.14.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 59b71fc6129f6d8c5a441981dc1577fa9b761380ff119bed4985cfcd88ccb31b
-  data.tar.gz: 2de3fcadc334ee2689d3083ea9324127c3b22ec94cf1b08dec920f9c95771445
+  metadata.gz: cf902c947e866cc99e79fbb9f8a51c829accd44aed03ef7657562bf41932c73d
+  data.tar.gz: 1bc5a0698a17b244ee9228d7728767dd00218179a5a49e0852a74cc722322ef0
 SHA512:
-  metadata.gz: b1e392bc021f6f758b79b7bdcd099af2ac391863f8712dadb5fd19248946867cfd89f140b836532fb40554c82697b26ef3af00b7cbb2cb13b0d5a8e2a38c87e7
-  data.tar.gz: 8c0e81589202e4a094c17604354f0f23a08b4536fe60b58ffe616cf1233c0531547ef02b8e88b6f70b1870ce2d134e4518ee093a5349144e2edfce3b1088e06c
+  metadata.gz: 629de8698af1391c210b496e9aadb51ad5f9d7157b1be5d0aa669ae821671e2b5624ba51083fb14b61f93618ff3e90aea1ac0eccb6ea00360fac48a2dfc436c7
+  data.tar.gz: 3f3706bee5f2a92a2addae034201e2e8cee3fef43efdc323be963cbaf1b94c31c53aa49a19e58a068498722dfe07e9796e097fb04364a9afda56d06132e6b935
data/.circleci/config.yml ADDED
@@ -0,0 +1,31 @@
+version: 2.1
+orbs:
+  ruby: circleci/ruby@0.1.2
+
+jobs:
+  build:
+    docker:
+      - image: circleci/ruby:2.6.4-stretch-node
+        environment:
+          BUNDLER_VERSION: 2.1.4
+    executor: ruby/default
+    steps:
+      - checkout
+      - run:
+          name: Update bundler
+          command: gem update bundler
+      - run:
+          name: Which bundler?
+          command: bundle -v
+      - restore_cache:
+          keys:
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}
+            - gem-cache-v1
+      - run: bundle install --path vendor/bundle
+      - run: bundle clean
+      - save_cache:
+          key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+          paths:
+            - vendor/bundle
+      - run: bundle exec rspec
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
 /pkg/
 /spec/reports/
 /tmp/
-
+/cache
 /.idea
data/.rspec ADDED
@@ -0,0 +1,2 @@
+-fd
+--require spec_helper
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec
 
 gem "rake", "~> 12.0"
 gem "minitest", "~> 5.0"
+gem "rspec"
data/Gemfile.lock CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.9.0)
+    twitterscraper-ruby (0.14.0)
       nokogiri
       parallel
 
 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
 
 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!
 
 BUNDLED WITH
data/README.md CHANGED
@@ -1,5 +1,6 @@
 # twitterscraper-ruby
 
+[![Build Status](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
 [![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)
 
 A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
@@ -33,7 +34,7 @@ Command-line interface:
 
 ```shell script
 $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --proxy --output output.json
+    --limit 100 --threads 10 --output output.json
 ```
 
 From Within Ruby:
@@ -47,10 +48,9 @@ options = {
   lang: 'ja',
   limit: 100,
   threads: 10,
-  proxy: true
 }
 
-client = Twitterscraper::Client.new
+client = Twitterscraper::Client.new(cache: true, proxy: true)
 tweets = client.query_tweets(KEYWORD, options)
 
 tweets.each do |tweet|
@@ -137,13 +137,17 @@ $ cat tweets.json | jq . | less
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
-| `--end_date` | Set the enddate which twitterscraper-ruby should use to stop scraping for your query. | |
+| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+| `--order` | Sort order of the results. | desc |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
-| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+| `--proxy` | Scrape https://twitter.com/search via proxies. | true |
+| `--cache` | Enable caching. | true |
+| `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | |
 
 
 ## Contributing
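Taken together, the README changes above describe the 0.14.0 API. A minimal sketch, assuming the gem is installed; `'KEYWORD'` and the option values are placeholders:

```ruby
require 'twitterscraper'

# cache/proxy moved from the query options to the constructor; both default to true.
client = Twitterscraper::Client.new(cache: true, proxy: true)

tweets = client.query_tweets('KEYWORD',
                             start_date: '2020-06-01',
                             end_date: '2020-06-30',
                             lang: 'ja',
                             limit: 100,
                             threads: 10)

tweets.each do |tweet|
  puts tweet.tweet_url
end
```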
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
   cli.parse
   cli.run
 rescue => e
-  STDERR.puts e.message
+  STDERR.puts e.inspect
   STDERR.puts e.backtrace.join("\n")
   exit 1
 end
data/lib/twitterscraper.rb CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
+require 'twitterscraper/template'
 require 'version'
 
 module Twitterscraper
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
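A minimal usage sketch of the new file-backed cache: entries are written under ./cache and expire after the 1-hour TTL. The URL key below is hypothetical; `json` and `time` are required explicitly here since `Entry` relies on `JSON.parse` and `Time.parse`:

```ruby
require 'json'
require 'time'
require 'twitterscraper'

cache = Twitterscraper::Cache.new # creates ./cache on first use

# fetch yields only on a miss and stores the block's value; within the TTL
# the cached value is returned and the block is skipped.
body = cache.fetch('https://example.com/search?q=test') do
  'response body'
end
```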
data/lib/twitterscraper/cli.rb CHANGED
@@ -20,12 +20,25 @@ module Twitterscraper
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
+        order: options['order'],
         threads: options['threads'],
-        proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-      File.write(options['output'], generate_json(tweets)) unless tweets.empty?
+      export(tweets) unless tweets.empty?
+    end
+
+    def export(tweets)
+      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+      if options['format'] == 'json'
+        write_json.call
+      elsif options['format'] == 'html'
+        File.write('tweets.html', Template.tweets_embedded_html(tweets))
+      else
+        write_json.call
+      end
     end
 
     def generate_json(tweets)
@@ -51,17 +64,28 @@ module Twitterscraper
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
+        'order:',
         'threads:',
         'output:',
-        'proxy',
+        'format:',
+        'cache:',
+        'proxy:',
         'pretty',
         'verbose',
       )
 
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
      options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
-      options['output'] ||= 'tweets.json'
+      options['format'] ||= 'json'
+      options['order'] ||= 'desc'
+      options['output'] ||= "tweets.#{options['format']}"
+
+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
 
       options
     end
@@ -86,7 +110,7 @@ module Twitterscraper
     end
 
     def print_version
-      puts "twitterscraper-#{Twitterscraper::VERSION}"
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
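A hedged sketch of the new boolean semantics for `--cache`/`--proxy`: both options now take a string value, and anything other than the literal string 'false' (including the flag being omitted entirely) counts as enabled. `boolean_option` is a hypothetical helper for illustration only:

```ruby
# Hypothetical helper mirroring the parsing rule added above.
def boolean_option(raw)
  raw != 'false'
end

boolean_option(nil)     # => true  (flag omitted)
boolean_option('true')  # => true
boolean_option('false') # => false (e.g. --cache false)
```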
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,18 @@
 module Twitterscraper
   class Client
     include Query
+
+    def initialize(cache: true, proxy: true)
+      @cache = cache
+      @proxy = proxy
+    end
+
+    def cache_enabled?
+      @cache
+    end
+
+    def proxy_enabled?
+      @proxy
+    end
   end
 end
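A short usage sketch of the new constructor and its predicates:

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: false, proxy: true)
client.cache_enabled? # => false
client.proxy_enabled? # => true

# Both flags default to true:
Twitterscraper::Client.new.cache_enabled? # => true
```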
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-      item = @items[@cur_index - 1]
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end
 
     def size
       @items.size
     end
 
+    def empty?
+      @items.empty?
+    end
+
     private
 
     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end
 
-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
      proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
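With per-item logging moved out of the pool (the caller in query.rb now logs the chosen proxy), `Pool` exposes `size` and the new `empty?`. A sketch, noting that constructing a pool fetches a live proxy list over the network:

```ruby
require 'twitterscraper'

pool = Twitterscraper::Proxy::Pool.new # fetches and shuffles a proxy list
puts "#{pool.size} proxies available" unless pool.empty?
```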
data/lib/twitterscraper/query.rb CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-      Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info("Retrying... (Attempts left: #{retries - 1})")
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
@@ -71,14 +75,27 @@ module Twitterscraper
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info("Querying #{query}")
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
-      logger.debug("Scraping tweets from #{url}")
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
 
-      response = get_single_page(url, headers, proxies)
-      return [], nil if response.nil?
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -99,33 +116,34 @@ module Twitterscraper
 
     OLDEST_DATE = Date.parse('2006-03-21')
 
-    def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
+    def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -143,27 +161,32 @@ module Twitterscraper
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.info("Limit reached #{@all_tweets.size}")
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -172,37 +195,51 @@ module Twitterscraper
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-      threads = queries.size if threads > queries.size
-      proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-      logger.info("The number of threads #{threads}")
+      validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info("Headers #{headers}")
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
          break if stop_requested?
         end
       end
 
-      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
     end
   end
 end
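A sketch of the new query knobs taken together: `daily_limit` caps how many tweets each per-day query collects, and `order` flips the final created_at sort (values per the diff above):

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new
tweets = client.query_tweets('KEYWORD',
                             start_date: '2020-06-01',
                             end_date: '2020-06-03',
                             daily_limit: 50, # stop each per-day query at ~50 tweets
                             order: 'asc')    # oldest first instead of the default 'desc'
```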
data/lib/twitterscraper/template.rb ADDED
@@ -0,0 +1,48 @@
+module Twitterscraper
+  module Template
+    module_function
+
+    def tweets_embedded_html(tweets)
+      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+    end
+
+    EMBED_TWEET_HTML = <<~'HTML'
+      <blockquote class="twitter-tweet">
+        <a href="__TWEET_URL__"></a>
+      </blockquote>
+    HTML
+
+    EMBED_TWEETS_HTML = <<~'HTML'
+      <html>
+        <head>
+          <style type=text/css>
+            .twitter-tweet {
+              margin: 30px auto 0 auto !important;
+            }
+          </style>
+          <script>
+            window.twttr = (function(d, s, id) {
+              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+              if (d.getElementById(id)) return t;
+              js = d.createElement(s);
+              js.id = id;
+              js.src = "https://platform.twitter.com/widgets.js";
+              fjs.parentNode.insertBefore(js, fjs);
+
+              t._e = [];
+              t.ready = function(f) {
+                t._e.push(f);
+              };
+
+              return t;
+            }(document, "script", "twitter-wjs"));
+          </script>
+        </head>
+        <body>
+        __TWEETS__
+        </body>
+      </html>
+    HTML
+  end
+end
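A minimal sketch of the HTML export path this template enables, equivalent to what the CLI's `--format html` branch does:

```ruby
require 'twitterscraper'

tweets = Twitterscraper::Client.new.query_tweets('KEYWORD', limit: 10)

# Each tweet URL is wrapped in an embedded-tweet blockquote; widgets.js
# renders them when tweets.html is opened in a browser.
File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
```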
data/lib/twitterscraper/tweet.rb CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -51,12 +59,19 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end
 
       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -81,7 +96,7 @@ module Twitterscraper
 
         timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-          screen_name: html.attr('data-screen-name'),
+          screen_name: screen_name,
           name: html.attr('data-name'),
           user_id: html.attr('data-user-id').to_i,
           tweet_id: tweet_id,
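`Tweet.from_json` is the inverse of the CLI's JSON export; a sketch reading a previously exported file back into `Tweet` objects (created_at strings are re-parsed into `Time` instances):

```ruby
require 'twitterscraper'

tweets = Twitterscraper::Tweet.from_json(File.read('tweets.json'))
tweets.first.created_at.class # => Time
```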
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.9.0'
+  VERSION = '0.14.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.14.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-13 00:00:00.000000000 Z
+date: 2020-07-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -46,8 +46,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
@@ -61,6 +63,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb
@@ -68,6 +71,7 @@ files:
 - lib/twitterscraper/logger.rb
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
+- lib/twitterscraper/template.rb
 - lib/twitterscraper/tweet.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec