twitterscraper-ruby 0.10.0 → 0.15.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c2429cf6172b5f19caede64ac35f5c796a7c8a67e76fff8dd2f08677fb15406b
- data.tar.gz: 0f32ca6b559a18c4e3aac3205f6503149e372d4d7d1976b1e83db26036d9ff17
+ metadata.gz: a950fb24329aaa1020441e258a8a2144100d732142b6c227bb9b026b8bb73996
+ data.tar.gz: 1f64f31e43189e2ee439f5ef6f6d54bc6ea58895adbed67cb8ddbe91af07681a
  SHA512:
- metadata.gz: a36ce6c91a363b64b36deeb3abbaaaebb725f3449f280b70be92532497a94dc5915ba449926acfacfc0d852d52471d258d41140a8891e64b6040bf262d0c347f
- data.tar.gz: a737c7db151190a1493b1a2a92bea304cfcf7512b2ee03fc13c6f25794f5dc727fe548e52cb39eccc2a63261fee0d58fc005920a0e7cd7650d20600e184d79cb
+ metadata.gz: 8573affbc9a5faa05e5e489364bb2ba0da1aa4f12af35445e5de8b1f8c399eb0575cc9f408b2ba96c3d7fd8b2a74b7dd703229053a33c1f8a883856818033cb9
+ data.tar.gz: 2b2b3ad0b2dd9d089a7b6127ed1b0db21e7f4fa5f0c31e6b366d9b5ae444e2244d4200c813b7a3257f43702d2caa9f264515e701602c24f4482a746b89d41328
data/.circleci/config.yml ADDED
@@ -0,0 +1,31 @@
+ version: 2.1
+ orbs:
+ ruby: circleci/ruby@0.1.2
+
+ jobs:
+ build:
+ docker:
+ - image: circleci/ruby:2.6.4-stretch-node
+ environment:
+ BUNDLER_VERSION: 2.1.4
+ executor: ruby/default
+ steps:
+ - checkout
+ - run:
+ name: Update bundler
+ command: gem update bundler
+ - run:
+ name: Which bundler?
+ command: bundle -v
+ - restore_cache:
+ keys:
+ - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+ - gem-cache-v1-{{ arch }}-{{ .Branch }}
+ - gem-cache-v1
+ - run: bundle install --path vendor/bundle
+ - run: bundle clean
+ - save_cache:
+ key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+ paths:
+ - vendor/bundle
+ - run: bundle exec rspec
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
  /pkg/
  /spec/reports/
  /tmp/
-
+ /cache
  /.idea
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ -fd
+ --require spec_helper
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec
 
  gem "rake", "~> 12.0"
  gem "minitest", "~> 5.0"
+ gem "rspec"
data/Gemfile.lock CHANGED
@@ -1,19 +1,33 @@
  PATH
  remote: .
  specs:
- twitterscraper-ruby (0.10.0)
+ twitterscraper-ruby (0.15.0)
  nokogiri
  parallel
 
  GEM
  remote: https://rubygems.org/
  specs:
+ diff-lcs (1.4.4)
  mini_portile2 (2.4.0)
  minitest (5.14.1)
  nokogiri (1.10.10)
  mini_portile2 (~> 2.4.0)
  parallel (1.19.2)
  rake (12.3.3)
+ rspec (3.9.0)
+ rspec-core (~> 3.9.0)
+ rspec-expectations (~> 3.9.0)
+ rspec-mocks (~> 3.9.0)
+ rspec-core (3.9.2)
+ rspec-support (~> 3.9.3)
+ rspec-expectations (3.9.2)
+ diff-lcs (>= 1.2.0, < 2.0)
+ rspec-support (~> 3.9.0)
+ rspec-mocks (3.9.1)
+ diff-lcs (>= 1.2.0, < 2.0)
+ rspec-support (~> 3.9.0)
+ rspec-support (3.9.3)
 
  PLATFORMS
  ruby
@@ -21,6 +35,7 @@ PLATFORMS
  DEPENDENCIES
  minitest (~> 5.0)
  rake (~> 12.0)
+ rspec
  twitterscraper-ruby!
 
  BUNDLED WITH
data/README.md CHANGED
@@ -1,5 +1,6 @@
  # twitterscraper-ruby
 
+ [![Build Status](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
  [![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)
 
  A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
@@ -32,27 +33,39 @@ $ gem install twitterscraper-ruby
  Command-line interface:
 
  ```shell script
- $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
- --limit 100 --threads 10 --proxy --output output.json
+ # Returns a collection of relevant tweets matching a specified query.
+ $ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
+ --limit 100 --threads 10 --output tweets.json
+ ```
+
+ ```shell script
+ # Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+ $ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
  ```
 
  From Within Ruby:
 
  ```ruby
  require 'twitterscraper'
+ client = Twitterscraper::Client.new(cache: true, proxy: true)
+ ```
+
+ ```ruby
+ # Returns a collection of relevant tweets matching a specified query.
+ tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
+ ```
 
- options = {
- start_date: '2020-06-01',
- end_date: '2020-06-30',
- lang: 'ja',
- limit: 100,
- threads: 10,
- proxy: true
- }
+ ```ruby
+ # Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+ tweets = client.user_timeline(SCREEN_NAME, limit: 100)
+ ```
 
- client = Twitterscraper::Client.new
- tweets = client.query_tweets(KEYWORD, options)
 
+ ## Attributes
+
+ ### Tweet
+
+ ```ruby
  tweets.each do |tweet|
  puts tweet.tweet_id
  puts tweet.text
@@ -64,11 +77,6 @@ tweets.each do |tweet|
  end
  ```
 
-
- ## Attributes
-
- ### Tweet
-
  - screen_name
  - name
  - user_id
@@ -136,15 +144,19 @@ $ cat tweets.json | jq . | less
  | Option | Description | Default |
  | ------------- | ------------- | ------------- |
  | `-h`, `--help` | This option displays a summary of twitterscraper. | |
+ | `--type` | Specify a search type. | search |
  | `--query` | Specify a keyword used during the search. | |
- | `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
- | `--end_date` | Set the end date which twitterscraper-ruby should use to stop scraping for your query. | |
+ | `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+ | `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
  | `--lang` | Retrieve tweets written in a specific language. | |
  | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+ | `--order` | Sort order of the results. | desc |
  | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
- | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+ | `--proxy` | Scrape https://twitter.com/search via proxies. | true |
+ | `--cache` | Enable caching. | true |
  | `--format` | The format of the output. | json |
  | `--output` | The name of the output file. | tweets.json |
+ | `--verbose` | Print debug messages. | |
 
 
  ## Contributing
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
  cli.parse
  cli.run
  rescue => e
- STDERR.puts e.message
+ STDERR.puts e.inspect
  STDERR.puts e.backtrace.join("\n")
  exit 1
  end
data/lib/twitterscraper.rb CHANGED
@@ -2,6 +2,7 @@ require 'twitterscraper/logger'
  require 'twitterscraper/proxy'
  require 'twitterscraper/http'
  require 'twitterscraper/lang'
+ require 'twitterscraper/cache'
  require 'twitterscraper/query'
  require 'twitterscraper/client'
  require 'twitterscraper/tweet'
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,69 @@
+ require 'base64'
+ require 'digest/md5'
+
+ module Twitterscraper
+ class Cache
+ def initialize()
+ @ttl = 3600 # 1 hour
+ @dir = 'cache'
+ Dir.mkdir(@dir) unless File.exist?(@dir)
+ end
+
+ def read(key)
+ key = cache_key(key)
+ file = File.join(@dir, key)
+ entry = Entry.from_json(File.read(file))
+ entry.value if entry.time > Time.now - @ttl
+ rescue Errno::ENOENT => e
+ nil
+ end
+
+ def write(key, value)
+ key = cache_key(key)
+ entry = Entry.new(key, value, Time.now)
+ file = File.join(@dir, key)
+ File.write(file, entry.to_json)
+ end
+
+ def fetch(key, &block)
+ if (value = read(key))
+ value
+ else
+ yield.tap { |v| write(key, v) }
+ end
+ end
+
+ def cache_key(key)
+ value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+ value = Digest::MD5.hexdigest(value) if value.length >= 100
+ value
+ end
+
+ class Entry < Hash
+ attr_reader :key, :value, :time
+
+ def initialize(key, value, time)
+ @key = key
+ @value = value
+ @time = time
+ end
+
+ def attrs
+ {key: @key, value: @value, time: @time}
+ end
+
+ def to_json
+ hash = attrs
+ hash[:value] = Base64.encode64(hash[:value])
+ hash.to_json
+ end
+
+ class << self
+ def from_json(text)
+ json = JSON.parse(text)
+ new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+ end
+ end
+ end
+ end
+ end
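
A minimal usage sketch of the Cache class added above (the URL and page body below are placeholders, not part of the gem): `fetch` returns the cached value if it is younger than the hard-coded one-hour TTL, otherwise it runs the block, writes the result to the `cache/` directory, and returns it.

```ruby
require 'json'
require 'time'
require 'twitterscraper'

cache = Twitterscraper::Cache.new

url = 'https://twitter.com/i/search/timeline?q=ruby' # placeholder cache key
body = cache.fetch(url) do
  # Runs only on a cache miss; the returned string is Base64-encoded into
  # cache/<escaped key> and reused for up to an hour on later reads.
  '<html>placeholder page body</html>'
end
puts body
```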
data/lib/twitterscraper/cli.rb CHANGED
@@ -16,14 +16,16 @@ module Twitterscraper
  print_version || return if print_version?
 
  query_options = {
+ type: options['type'],
  start_date: options['start_date'],
  end_date: options['end_date'],
  lang: options['lang'],
  limit: options['limit'],
+ daily_limit: options['daily_limit'],
+ order: options['order'],
  threads: options['threads'],
- proxy: options['proxy']
  }
- client = Twitterscraper::Client.new
+ client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
  tweets = client.query_tweets(options['query'], query_options)
  export(tweets) unless tweets.empty?
  end
@@ -58,25 +60,36 @@ module Twitterscraper
  'help',
  'v',
  'version',
+ 'type:',
  'query:',
  'start_date:',
  'end_date:',
  'lang:',
  'limit:',
+ 'daily_limit:',
+ 'order:',
  'threads:',
  'output:',
  'format:',
- 'proxy',
+ 'cache:',
+ 'proxy:',
  'pretty',
  'verbose',
  )
 
+ options['type'] ||= 'search'
+ options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
  options['lang'] ||= ''
  options['limit'] = (options['limit'] || 100).to_i
+ options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
  options['threads'] = (options['threads'] || 2).to_i
  options['format'] ||= 'json'
+ options['order'] ||= 'desc'
  options['output'] ||= "tweets.#{options['format']}"
 
+ options['cache'] = options['cache'] != 'false'
+ options['proxy'] = options['proxy'] != 'false'
+
  options
  end
 
@@ -100,7 +113,7 @@ module Twitterscraper
  end
 
  def print_version
- puts "twitterscraper-#{Twitterscraper::VERSION}"
+ puts "twitterscraper-#{VERSION}"
  end
  end
  end
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,18 @@
  module Twitterscraper
  class Client
  include Query
+
+ def initialize(cache: true, proxy: true)
+ @cache = cache
+ @proxy = proxy
+ end
+
+ def cache_enabled?
+ @cache
+ end
+
+ def proxy_enabled?
+ @proxy
+ end
  end
  end
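
A short sketch of the new constructor flags (the values here are illustrative); the two predicates are what the query code consults to decide whether to use the on-disk cache and the proxy pool:

```ruby
require 'twitterscraper'

# Defaults keep both features on, matching the CLI's --cache/--proxy defaults.
client = Twitterscraper::Client.new
client.cache_enabled?  # => true
client.proxy_enabled?  # => true

# Disable both for a direct, uncached run.
direct = Twitterscraper::Client.new(cache: false, proxy: false)
direct.cache_enabled?  # => false
direct.proxy_enabled?  # => false
```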
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
  reload
  end
  @cur_index += 1
- item = @items[@cur_index - 1]
- Twitterscraper.logger.info("Using proxy #{item}")
- item
+ @items[@cur_index - 1]
  end
 
  def size
  @items.size
  end
 
+ def empty?
+ @items.empty?
+ end
+
  private
 
  def reload
@@ -51,7 +53,6 @@ module Twitterscraper
  proxies << ip + ':' + port
  end
 
- Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
  proxies.shuffle
  rescue => e
  if (retries -= 1) > 0
data/lib/twitterscraper/query.rb CHANGED
@@ -22,36 +22,41 @@ module Twitterscraper
  RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
  'default&include_available_features=1&include_entities=1&' +
  'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
- INIT_URL_USER = 'https://twitter.com/{u}'
- RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/{u}/timeline/tweets?' +
+ INIT_URL_USER = 'https://twitter.com/__USER__'
+ RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
  'include_available_features=1&include_entities=1&' +
- 'max_position={pos}&reset_error_state=false'
-
- def build_query_url(query, lang, pos, from_user = false)
- # if from_user
- # if !pos
- # INIT_URL_USER.format(u = query)
- # else
- # RELOAD_URL_USER.format(u = query, pos = pos)
- # end
- # end
- if pos
- RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+ 'max_position=__POS__&reset_error_state=false'
+
+ def build_query_url(query, lang, from_user, pos)
+ if from_user
+ if pos
+ RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+ else
+ INIT_URL_USER.sub('__USER__', query)
+ end
  else
- INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+ if pos
+ RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+ else
+ INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+ end
  end
  end
 
  def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
  return nil if stop_requested?
- Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+ unless proxies.empty?
+ proxy = proxies.sample
+ logger.info("Using proxy #{proxy}")
+ end
+ Http.get(url, headers, proxy, timeout)
  rescue => e
  logger.debug "query_single_page: #{e.inspect}"
  if (retries -= 1) > 0
- logger.info("Retrying... (Attempts left: #{retries - 1})")
+ logger.info "Retrying... (Attempts left: #{retries - 1})"
  retry
  else
- raise
+ raise Error.new("#{e.inspect} url=#{url}")
  end
  end
 
@@ -70,15 +75,28 @@ module Twitterscraper
  [items_html, json_resp]
  end
 
- def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
- logger.info("Querying #{query}")
+ def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+ logger.info "Querying #{query}"
  query = ERB::Util.url_encode(query)
 
- url = build_query_url(query, lang, pos, from_user)
- logger.debug("Scraping tweets from #{url}")
+ url = build_query_url(query, lang, type == 'user', pos)
+ http_request = lambda do
+ logger.debug "Scraping tweets from #{url}"
+ get_single_page(url, headers, proxies)
+ end
 
- response = get_single_page(url, headers, proxies)
- return [], nil if response.nil?
+ if cache_enabled?
+ client = Cache.new
+ if (response = client.read(url))
+ logger.debug 'Fetching tweets from cache'
+ else
+ response = http_request.call
+ client.write(url, response) unless stop_requested?
+ end
+ else
+ response = http_request.call
+ end
+ return [], nil if response.nil? || response.empty?
 
  html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -90,8 +108,8 @@ module Twitterscraper
 
  if json_resp
  [tweets, json_resp['min_position']]
- elsif from_user
- raise NotImplementedError
+ elsif type
+ [tweets, tweets[-1].tweet_id]
  else
  [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
  end
@@ -99,33 +117,34 @@ module Twitterscraper
 
  OLDEST_DATE = Date.parse('2006-03-21')
 
- def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
+ def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+ query = queries[0]
  if query.nil? || query == ''
- raise 'Please specify a search query.'
+ raise Error.new('Please specify a search query.')
  end
 
  if ERB::Util.url_encode(query).length >= 500
- raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+ raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
  end
 
  if start_date && end_date
  if start_date == end_date
- raise 'Please specify different values for :start_date and :end_date.'
+ raise Error.new('Please specify different values for :start_date and :end_date.')
  elsif start_date > end_date
- raise ':start_date must occur before :end_date.'
+ raise Error.new(':start_date must occur before :end_date.')
  end
  end
 
  if start_date
  if start_date < OLDEST_DATE
- raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+ raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
  end
  end
 
  if end_date
  today = Date.today
  if end_date > Date.today
- raise ":end_date must be less than or equal to today(#{today})"
+ raise Error.new(":end_date must be less than or equal to today(#{today})")
  end
  end
  end
@@ -143,27 +162,32 @@ module Twitterscraper
  end
  end
 
- def main_loop(query, lang, limit, headers, proxies)
+ def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
  pos = nil
+ daily_tweets = []
 
  while true
- new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
+ new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
  unless new_tweets.empty?
+ daily_tweets.concat(new_tweets)
+ daily_tweets.uniq! { |t| t.tweet_id }
+
  @mutex.synchronize {
  @all_tweets.concat(new_tweets)
  @all_tweets.uniq! { |t| t.tweet_id }
  }
  end
- logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+ logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
  break unless new_pos
+ break if daily_limit && daily_tweets.size >= daily_limit
  break if @all_tweets.size >= limit
 
  pos = new_pos
  end
 
- if @all_tweets.size >= limit
- logger.info("Limit reached #{@all_tweets.size}")
+ if !@stop_requested && @all_tweets.size >= limit
+ logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
  @stop_requested = true
  end
  end
@@ -172,37 +196,59 @@ module Twitterscraper
  @stop_requested
  end
 
- def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+ def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
  start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
  end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
  queries = build_queries(query, start_date, end_date)
- threads = queries.size if threads > queries.size
- proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+ if threads > queries.size
+ logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+ threads = queries.size
+ end
+ if proxy_enabled?
+ proxies = Proxy::Pool.new
+ logger.debug "Fetch #{proxies.size} proxies"
+ else
+ proxies = []
+ logger.debug 'Proxy disabled'
+ end
+ logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
- validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
- logger.info("The number of threads #{threads}")
+ validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+ logger.info "The number of threads #{threads}"
 
  headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
- logger.info("Headers #{headers}")
+ logger.info "Headers #{headers}"
 
  @all_tweets = []
  @mutex = Mutex.new
  @stop_requested = false
 
  if threads > 1
+ Thread.abort_on_exception = true
+ logger.debug "Set 'Thread.abort_on_exception' to true"
+
  Parallel.each(queries, in_threads: threads) do |query|
- main_loop(query, lang, limit, headers, proxies)
+ main_loop(query, lang, type, limit, daily_limit, headers, proxies)
  raise Parallel::Break if stop_requested?
  end
  else
  queries.each do |query|
- main_loop(query, lang, limit, headers, proxies)
+ main_loop(query, lang, type, limit, daily_limit, headers, proxies)
  break if stop_requested?
  end
  end
 
- @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+ @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+ end
+
+ def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+ query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+ end
+
+ def user_timeline(screen_name, limit: 100, order: 'desc')
+ query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
  end
  end
  end
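
A usage sketch of the `search` and `user_timeline` wrappers defined above, exercising the new `daily_limit` and `order` keywords (the query, screen name, and numbers are placeholders):

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: true)

# Collection for each day in the range stops once roughly 50 tweets have been
# gathered, and the result is sorted oldest-first instead of the default newest-first.
tweets = client.search('ruby', start_date: '2020-06-01', end_date: '2020-06-30',
                       lang: 'ja', limit: 1000, daily_limit: 50, order: 'asc', threads: 10)

# Most recent tweets for one user; query_tweets is called with threads: 1.
timeline = client.user_timeline('SCREEN_NAME', limit: 100)

puts tweets.size
puts timeline.size
```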
data/lib/twitterscraper/tweet.rb CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
  end
 
  class << self
+ def from_json(text)
+ json = JSON.parse(text)
+ json.map do |tweet|
+ tweet['created_at'] = Time.parse(tweet['created_at'])
+ new(tweet)
+ end
+ end
+
  def from_html(text)
  html = Nokogiri::HTML(text)
  from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -51,12 +59,19 @@ module Twitterscraper
  def from_tweets_html(html)
  html.map do |tweet|
  from_tweet_html(tweet)
- end
+ end.compact
  end
 
  def from_tweet_html(html)
+ screen_name = html.attr('data-screen-name')
+ tweet_id = html.attr('data-tweet-id')&.to_i
+
+ unless html.to_s.include?('js-tweet-text-container')
+ Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+ return nil
+ end
+
  inner_html = Nokogiri::HTML(html.inner_html)
- tweet_id = html.attr('data-tweet-id').to_i
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -81,7 +96,7 @@ module Twitterscraper
 
  timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
  new(
- screen_name: html.attr('data-screen-name'),
+ screen_name: screen_name,
  name: html.attr('data-name'),
  user_id: html.attr('data-user-id').to_i,
  tweet_id: tweet_id,
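
For context, a small sketch of the new `Tweet.from_json` above, assuming `tweets.json` was produced by an earlier CLI run with the default JSON format (the file name is a placeholder):

```ruby
require 'twitterscraper'

# Re-hydrate tweets exported earlier, e.g. by:
#   twitterscraper --type search --query ruby --limit 100 --output tweets.json
tweets = Twitterscraper::Tweet.from_json(File.read('tweets.json'))

tweets.each do |tweet|
  puts "#{tweet.created_at} @#{tweet.screen_name}: #{tweet.text}"
end
```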
data/lib/twitterscraper/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Twitterscraper
- VERSION = '0.10.0'
+ VERSION = '0.15.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: twitterscraper-ruby
  version: !ruby/object:Gem::Version
- version: 0.10.0
+ version: 0.15.0
  platform: ruby
  authors:
  - ts-3156
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-07-13 00:00:00.000000000 Z
+ date: 2020-07-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -46,8 +46,10 @@ executables:
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".circleci/config.yml"
  - ".gitignore"
  - ".irbrc"
+ - ".rspec"
  - ".ruby-version"
  - ".travis.yml"
  - CODE_OF_CONDUCT.md
@@ -61,6 +63,7 @@ files:
  - bin/twitterscraper
  - lib/twitterscraper-ruby.rb
  - lib/twitterscraper.rb
+ - lib/twitterscraper/cache.rb
  - lib/twitterscraper/cli.rb
  - lib/twitterscraper/client.rb
  - lib/twitterscraper/http.rb