twitterscraper-ruby 0.10.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c2429cf6172b5f19caede64ac35f5c796a7c8a67e76fff8dd2f08677fb15406b
-  data.tar.gz: 0f32ca6b559a18c4e3aac3205f6503149e372d4d7d1976b1e83db26036d9ff17
+  metadata.gz: a950fb24329aaa1020441e258a8a2144100d732142b6c227bb9b026b8bb73996
+  data.tar.gz: 1f64f31e43189e2ee439f5ef6f6d54bc6ea58895adbed67cb8ddbe91af07681a
 SHA512:
-  metadata.gz: a36ce6c91a363b64b36deeb3abbaaaebb725f3449f280b70be92532497a94dc5915ba449926acfacfc0d852d52471d258d41140a8891e64b6040bf262d0c347f
-  data.tar.gz: a737c7db151190a1493b1a2a92bea304cfcf7512b2ee03fc13c6f25794f5dc727fe548e52cb39eccc2a63261fee0d58fc005920a0e7cd7650d20600e184d79cb
+  metadata.gz: 8573affbc9a5faa05e5e489364bb2ba0da1aa4f12af35445e5de8b1f8c399eb0575cc9f408b2ba96c3d7fd8b2a74b7dd703229053a33c1f8a883856818033cb9
+  data.tar.gz: 2b2b3ad0b2dd9d089a7b6127ed1b0db21e7f4fa5f0c31e6b366d9b5ae444e2244d4200c813b7a3257f43702d2caa9f264515e701602c24f4482a746b89d41328
data/.circleci/config.yml ADDED
@@ -0,0 +1,31 @@
+version: 2.1
+orbs:
+  ruby: circleci/ruby@0.1.2
+
+jobs:
+  build:
+    docker:
+      - image: circleci/ruby:2.6.4-stretch-node
+        environment:
+          BUNDLER_VERSION: 2.1.4
+    executor: ruby/default
+    steps:
+      - checkout
+      - run:
+          name: Update bundler
+          command: gem update bundler
+      - run:
+          name: Which bundler?
+          command: bundle -v
+      - restore_cache:
+          keys:
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}
+            - gem-cache-v1
+      - run: bundle install --path vendor/bundle
+      - run: bundle clean
+      - save_cache:
+          key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+          paths:
+            - vendor/bundle
+      - run: bundle exec rspec
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
 /pkg/
 /spec/reports/
 /tmp/
-
+/cache
 /.idea
data/.rspec ADDED
@@ -0,0 +1,2 @@
+-fd
+--require spec_helper
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec

 gem "rake", "~> 12.0"
 gem "minitest", "~> 5.0"
+gem "rspec"
data/Gemfile.lock CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.10.0)
+    twitterscraper-ruby (0.15.0)
       nokogiri
       parallel

 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)

 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!

 BUNDLED WITH
data/README.md CHANGED
@@ -1,5 +1,6 @@
 # twitterscraper-ruby

+[![Build Status](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
 [![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)

 A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
@@ -32,27 +33,39 @@ $ gem install twitterscraper-ruby
 Command-line interface:

 ```shell script
-$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --proxy --output output.json
+# Returns a collection of relevant tweets matching a specified query.
+$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
+    --limit 100 --threads 10 --output tweets.json
+```
+
+```shell script
+# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
 ```

 From Within Ruby:

 ```ruby
 require 'twitterscraper'
+client = Twitterscraper::Client.new(cache: true, proxy: true)
+```
+
+```ruby
+# Returns a collection of relevant tweets matching a specified query.
+tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
+```

-options = {
-  start_date: '2020-06-01',
-  end_date: '2020-06-30',
-  lang: 'ja',
-  limit: 100,
-  threads: 10,
-  proxy: true
-}
+```ruby
+# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+tweets = client.user_timeline(SCREEN_NAME, limit: 100)
+```

-client = Twitterscraper::Client.new
-tweets = client.query_tweets(KEYWORD, options)

+## Attributes
+
+### Tweet
+
+```ruby
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
@@ -64,11 +77,6 @@ tweets.each do |tweet|
 end
 ```

-
-## Attributes
-
-### Tweet
-
 - screen_name
 - name
 - user_id
@@ -136,15 +144,19 @@ $ cat tweets.json | jq . | less
 | Option | Description | Default |
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
+| `--type` | Specify a search type. | search |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
-| `--end_date` | Set the end date which twitterscraper-ruby should use to stop scraping for your query. | |
+| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+| `--order` | Sort order of the results. | desc |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
-| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+| `--proxy` | Scrape https://twitter.com/search via proxies. | true |
+| `--cache` | Enable caching. | true |
 | `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | false |


 ## Contributing
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
   cli.parse
   cli.run
 rescue => e
-  STDERR.puts e.message
+  STDERR.puts e.inspect
   STDERR.puts e.backtrace.join("\n")
   exit 1
 end
data/lib/twitterscraper.rb CHANGED
@@ -2,6 +2,7 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
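The new cache stores each response as a Base64-encoded JSON entry under ./cache and honors a hard-coded one-hour TTL. A minimal usage sketch (the URLs are placeholders, and `require 'json'`/`require 'time'` are assumed to be loaded elsewhere in the gem):

```ruby
# Sketch only; assumes 'json' and 'time' are available, and that creating
# a ./cache directory in the working directory is acceptable.
require 'json'
require 'time'
require 'twitterscraper'

cache = Twitterscraper::Cache.new            # creates ./cache on first use
cache.write('https://example.com/?q=a', '<html>a</html>')
cache.read('https://example.com/?q=a')       # => "<html>a</html>" until the 1-hour TTL lapses

# fetch runs the block only on a miss, then stores the result:
body = cache.fetch('https://example.com/?q=b') { '<html>fresh</html>' }
```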
data/lib/twitterscraper/cli.rb CHANGED
@@ -16,14 +16,16 @@ module Twitterscraper
       print_version || return if print_version?

       query_options = {
+        type: options['type'],
         start_date: options['start_date'],
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
+        order: options['order'],
         threads: options['threads'],
-        proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
       export(tweets) unless tweets.empty?
     end
@@ -58,25 +60,36 @@ module Twitterscraper
         'help',
         'v',
         'version',
+        'type:',
         'query:',
         'start_date:',
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
+        'order:',
         'threads:',
         'output:',
         'format:',
-        'proxy',
+        'cache:',
+        'proxy:',
         'pretty',
         'verbose',
       )

+      options['type'] ||= 'search'
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
       options['format'] ||= 'json'
+      options['order'] ||= 'desc'
      options['output'] ||= "tweets.#{options['format']}"

+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
+
       options
     end

@@ -100,7 +113,7 @@ module Twitterscraper
     end

     def print_version
-      puts "twitterscraper-#{Twitterscraper::VERSION}"
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
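Note that `--cache` and `--proxy` are now value-taking options that default to enabled; per the parsing above (`options['cache'] != 'false'`), anything other than the literal string `false` leaves them on. A sketch:

```shell script
# Sketch: disable caching and proxy rotation explicitly.
$ twitterscraper --type search --query KEYWORD --cache false --proxy false

# Omitting both flags leaves caching and proxies enabled (the new defaults).
$ twitterscraper --type search --query KEYWORD
```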
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,18 @@
 module Twitterscraper
   class Client
     include Query
+
+    def initialize(cache: true, proxy: true)
+      @cache = cache
+      @proxy = proxy
+    end
+
+    def cache_enabled?
+      @cache
+    end
+
+    def proxy_enabled?
+      @proxy
+    end
   end
 end
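A short sketch of the new constructor and predicates:

```ruby
# Both keyword arguments default to true, so these two clients are equivalent:
client = Twitterscraper::Client.new
client = Twitterscraper::Client.new(cache: true, proxy: true)

# Either behavior can be turned off per client:
client = Twitterscraper::Client.new(cache: false, proxy: false)
client.cache_enabled? # => false
client.proxy_enabled? # => false
```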
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
           reload
         end
         @cur_index += 1
-        item = @items[@cur_index - 1]
-        Twitterscraper.logger.info("Using proxy #{item}")
-        item
+        @items[@cur_index - 1]
       end

       def size
         @items.size
       end

+      def empty?
+        @items.empty?
+      end
+
       private

       def reload
@@ -51,7 +53,6 @@ module Twitterscraper
           proxies << ip + ':' + port
         end

-        Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
         proxies.shuffle
       rescue => e
         if (retries -= 1) > 0
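With logging moved out of the pool, callers now check `empty?` and log the sampled proxy themselves, as query.rb does below. A sketch of the resulting surface (the `sample` name is inferred from the `proxies.sample` call in query.rb):

```ruby
# Sketch: Pool no longer logs; the caller decides what to report.
pool = Twitterscraper::Proxy::Pool.new
puts "Fetch #{pool.size} proxies"
proxy = pool.sample unless pool.empty?   # round-robin pick, reloading when exhausted
```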
data/lib/twitterscraper/query.rb CHANGED
@@ -22,36 +22,41 @@ module Twitterscraper
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
         'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
-    INIT_URL_USER = 'https://twitter.com/{u}'
-    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/{u}/timeline/tweets?' +
+    INIT_URL_USER = 'https://twitter.com/__USER__'
+    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
         'include_available_features=1&include_entities=1&' +
-        'max_position={pos}&reset_error_state=false'
-
-    def build_query_url(query, lang, pos, from_user = false)
-      # if from_user
-      #   if !pos
-      #     INIT_URL_USER.format(u = query)
-      #   else
-      #     RELOAD_URL_USER.format(u = query, pos = pos)
-      #   end
-      # end
-      if pos
-        RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        'max_position=__POS__&reset_error_state=false'
+
+    def build_query_url(query, lang, from_user, pos)
+      if from_user
+        if pos
+          RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+        else
+          INIT_URL_USER.sub('__USER__', query)
+        end
       else
-        INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+        if pos
+          RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        else
+          INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+        end
       end
     end

     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-      Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info("Retrying... (Attempts left: #{retries - 1})")
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end

@@ -70,15 +75,28 @@ module Twitterscraper
       [items_html, json_resp]
     end

-    def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info("Querying #{query}")
+    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)

-      url = build_query_url(query, lang, pos, from_user)
-      logger.debug("Scraping tweets from #{url}")
+      url = build_query_url(query, lang, type == 'user', pos)
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end

-      response = get_single_page(url, headers, proxies)
-      return [], nil if response.nil?
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?

       html, json_resp = parse_single_page(response, pos.nil?)

@@ -90,8 +108,8 @@

       if json_resp
         [tweets, json_resp['min_position']]
-      elsif from_user
-        raise NotImplementedError
+      elsif type
+        [tweets, tweets[-1].tweet_id]
       else
         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
       end
@@ -99,33 +117,34 @@

     OLDEST_DATE = Date.parse('2006-03-21')

-    def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
+    def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end

       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
      end

       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end

       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end

       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -143,27 +162,32 @@
       end
     end

-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []

       while true
-        new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit

         pos = new_pos
       end

-      if @all_tweets.size >= limit
-        logger.info("Limit reached #{@all_tweets.size}")
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -172,37 +196,59 @@
       @stop_requested
     end

-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-      threads = queries.size if threads > queries.size
-      proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)

-      logger.info("The number of threads #{threads}")
+      validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"

       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info("Headers #{headers}")
+      logger.info "Headers #{headers}"

       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false

       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end

-      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+    end
+
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    end
+
+    def user_timeline(screen_name, limit: 100, order: 'desc')
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
     end
   end
 end
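A brief sketch of the two new public entry points that wrap `query_tweets` (KEYWORD and SCREEN_NAME are placeholders, as in the README):

```ruby
# Sketch: the new convenience wrappers.
client = Twitterscraper::Client.new(cache: true, proxy: true)

# type: 'search'; oldest first, and at most 10 tweets per day-sized sub-query:
tweets = client.search('KEYWORD', start_date: '2020-06-01', end_date: '2020-06-30',
                       daily_limit: 10, order: 'asc', limit: 100, threads: 10)

# type: 'user' forces a single thread and takes no date range:
tweets = client.user_timeline('SCREEN_NAME', limit: 100)
```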
data/lib/twitterscraper/tweet.rb CHANGED
@@ -43,6 +43,14 @@
     end

     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -51,12 +59,19 @@
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end

       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -81,7 +96,7 @@

         timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-          screen_name: html.attr('data-screen-name'),
+          screen_name: screen_name,
           name: html.attr('data-name'),
           user_id: html.attr('data-user-id').to_i,
           tweet_id: tweet_id,
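A sketch of the new `Tweet.from_json`, which reads back the gem's JSON export; the attribute list here is an illustrative subset, and the exact shape `Tweet.new` accepts is assumed from the diff:

```ruby
# Sketch: round-trip tweets through the gem's JSON representation.
require 'json'

text = '[{"screen_name":"ts-3156","tweet_id":1,"created_at":"2020-07-17 00:00:00 +0000"}]'
tweets = Twitterscraper::Tweet.from_json(text)
tweets[0].created_at # from_json parses the created_at string into a Time object
```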
data/lib/twitterscraper/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.10.0'
+  VERSION = '0.15.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.15.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-13 00:00:00.000000000 Z
+date: 2020-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -46,8 +46,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
@@ -61,6 +63,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb