twitterscraper-ruby 0.11.0 → 0.15.1
- checksums.yaml +4 -4
- data/.circleci/config.yml +31 -0
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +16 -1
- data/README.md +87 -56
- data/lib/twitterscraper/cli.rb +17 -5
- data/lib/twitterscraper/client.rb +6 -1
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +82 -49
- data/lib/twitterscraper/tweet.rb +10 -3
- data/lib/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 7f04cb0ba394884918271b5485b596c07203b7a6e9f4fec42d074ef4f02b6a0a
+  data.tar.gz: a4f618df53d1e8b54954619e87d383e43dbe5a63bbf83b33ee38f975998f2678
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: fa9f02cf3ef0bf280f45b18ebacaec0b06dbd610477355602fcc59d382b5590c990695297e1e793457fdcff4cb7dd037f076c1f0fa4706eb69c67c3a165243e4
+  data.tar.gz: 9c08d9e4d1ee56fa133675bc73a50f502040cc9a2844d9a46a39c38ccdffdf43c15b17c2e4a8b74561f523493ccbc4a055f0add239574d2f5129ee4abe1f5ed9
data/.circleci/config.yml
ADDED

@@ -0,0 +1,31 @@
+version: 2.1
+orbs:
+  ruby: circleci/ruby@0.1.2
+
+jobs:
+  build:
+    docker:
+      - image: circleci/ruby:2.6.4-stretch-node
+        environment:
+          BUNDLER_VERSION: 2.1.4
+    executor: ruby/default
+    steps:
+      - checkout
+      - run:
+          name: Update bundler
+          command: gem update bundler
+      - run:
+          name: Which bundler?
+          command: bundle -v
+      - restore_cache:
+          keys:
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}
+            - gem-cache-v1
+      - run: bundle install --path vendor/bundle
+      - run: bundle clean
+      - save_cache:
+          key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+          paths:
+            - vendor/bundle
+      - run: bundle exec rspec
data/.rspec
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED

@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.11.0)
+    twitterscraper-ruby (0.15.1)
       nokogiri
       parallel

 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)

 PLATFORMS
   ruby

@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!

 BUNDLED WITH
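RSpec enters the lockfile here as a new development dependency, matching the `bundle exec rspec` step in the CircleCI config above. A minimal spec consistent with this setup (the file name and expectation are illustrative, not taken from the gem) would be:

```ruby
# spec/twitterscraper_spec.rb (hypothetical path); run with `bundle exec rspec`.
require 'twitterscraper'

RSpec.describe Twitterscraper do
  it 'has a version number' do
    # VERSION is referenced by cli.rb's print_version, so the constant exists.
    expect(Twitterscraper::VERSION).not_to be nil
  end
end
```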
data/README.md
CHANGED

@@ -1,18 +1,21 @@
 # twitterscraper-ruby

+[![Build Status](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
 [![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)

 A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).

+Please feel free to ask [@ts_3156](https://twitter.com/ts_3156) if you have any questions.
+

 ## Twitter Search API vs. twitterscraper-ruby

-…
+#### Twitter Search API

 - The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
 - The time window: the past 7 days

-…
+#### twitterscraper-ruby

 - The number of tweets: Unlimited
 - The time window: from 2006-3-21 to today

@@ -29,45 +32,92 @@ $ gem install twitterscraper-ruby

 ## Usage

-Command-line interface:
+#### Command-line interface:
+
+Returns a collection of relevant tweets matching a specified query.

 ```shell script
-$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --…
+$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
+    --limit 100 --threads 10 --output tweets.json
 ```

-…
+Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
+
+```shell script
+$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
+```
+
+#### From Within Ruby:

 ```ruby
 require 'twitterscraper'
+client = Twitterscraper::Client.new(cache: true, proxy: true)
+```

-…
-    start_date: '2020-06-01',
-    end_date: '2020-06-30',
-    lang: 'ja',
-    limit: 100,
-    threads: 10,
-    proxy: true
-}
+Returns a collection of relevant tweets matching a specified query.

-…
-tweets = client.…
+```ruby
+tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
+```
+
+Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
+
+```ruby
+tweets = client.user_timeline(SCREEN_NAME, limit: 100)
+```

+
+## Examples
+
+```shell script
+$ twitterscraper --query twitter --limit 1000
+$ cat tweets.json | jq . | less
+```
+
+
+## Attributes
+
+### Tweet
+
+```ruby
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
   puts tweet.tweet_url
   puts tweet.created_at

+  attr_names = hash.keys
   hash = tweet.attrs
-…
+  json = tweet.to_json
 end
 ```

-…
+```json
+[
+  {
+    "screen_name": "@name",
+    "name": "Name",
+    "user_id": 12340000,
+    "tweet_id": 1234000000000000,
+    "text": "Thanks Twitter!",
+    "links": [],
+    "hashtags": [],
+    "image_urls": [],
+    "video_url": null,
+    "has_media": null,
+    "likes": 10,
+    "retweets": 20,
+    "replies": 0,
+    "is_replied": false,
+    "is_reply_to": false,
+    "parent_tweet_id": null,
+    "reply_to_users": [],
+    "tweet_url": "https://twitter.com/name/status/1234000000000000",
+    "timestamp": 1594793000,
+    "created_at": "2020-07-15 00:00:00 +0000"
+  }
+]
+```

 - screen_name
 - name

@@ -110,43 +160,24 @@ end
 Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).


-## Examples
-
-```shell script
-$ twitterscraper --query twitter --limit 1000
-$ cat tweets.json | jq . | less
-```
-
-```json
-[
-  {
-    "screen_name": "@screenname",
-    "name": "name",
-    "user_id": 1194529546483000000,
-    "tweet_id": 1282659891992000000,
-    "tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
-    "created_at": "2020-07-13 12:00:00 +0000",
-    "text": "Thanks Twitter!"
-  }
-]
-```
-
 ## CLI Options

-| Option | Description |
-| ------------- | ------------- | ------------- |
-| `--… (remaining option rows truncated in source)
+| Option | Type | Description | Value |
+| ------------- | ------------- | ------------- | ------------- |
+| `--help` | string | This option displays a summary of twitterscraper. | |
+| `--type` | string | Specify a search type. | search (default) or user |
+| `--query` | string | Specify a keyword used during the search. | |
+| `--start_date` | string | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | string | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
+| `--lang` | string | Retrieve tweets written in a specific language. | |
+| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+| `--order` | string | Sort order of the results. | desc (default) or asc |
+| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true (default) or false |
+| `--cache` | boolean | Enable caching. | true (default) or false |
+| `--format` | string | The format of the output. | json (default) or html |
+| `--output` | string | The name of the output file. | tweets.json |
+| `--verbose` | | Print debug messages. | |


 ## Contributing
data/lib/twitterscraper/cli.rb
CHANGED

@@ -16,14 +16,16 @@ module Twitterscraper
       print_version || return if print_version?

       query_options = {
+          type: options['type'],
           start_date: options['start_date'],
           end_date: options['end_date'],
           lang: options['lang'],
           limit: options['limit'],
+          daily_limit: options['daily_limit'],
+          order: options['order'],
           threads: options['threads'],
-          proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new(cache: options['cache'])
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
       export(tweets) unless tweets.empty?
     end

@@ -58,26 +60,36 @@ module Twitterscraper
           'help',
           'v',
           'version',
+          'type:',
           'query:',
           'start_date:',
           'end_date:',
           'lang:',
           'limit:',
+          'daily_limit:',
+          'order:',
           'threads:',
           'output:',
           'format:',
-          'cache',
-          'proxy',
+          'cache:',
+          'proxy:',
          'pretty',
           'verbose',
       )

+      options['type'] ||= 'search'
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
       options['format'] ||= 'json'
+      options['order'] ||= 'desc'
       options['output'] ||= "tweets.#{options['format']}"

+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
+
       options
     end

@@ -101,7 +113,7 @@ module Twitterscraper
     end

     def print_version
-      puts "twitterscraper-#{…}"
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
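Note the switch from bare `cache`/`proxy` flags to value-taking `cache:`/`proxy:` options: the parsed values now arrive as strings, and the coercion above enables a feature for any value except the literal string 'false'. A quick sketch of that rule (the lambda is illustrative, not gem code):

```ruby
# Only the literal string 'false' disables a flag; anything else, including
# an omitted flag (nil), leaves the feature enabled.
enabled = ->(value) { value != 'false' }

enabled.call('false') # => false  (--cache false)
enabled.call('true')  # => true   (--cache true)
enabled.call(nil)     # => true   (flag omitted; both features default on)
```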
data/lib/twitterscraper/client.rb
CHANGED

@@ -2,12 +2,17 @@ module Twitterscraper
   class Client
     include Query

-    def initialize(cache:)
+    def initialize(cache: true, proxy: true)
       @cache = cache
+      @proxy = proxy
     end

     def cache_enabled?
       @cache
     end
+
+    def proxy_enabled?
+      @proxy
+    end
   end
 end
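With the new keyword defaults, caching and proxy use are both on unless explicitly disabled. A minimal sketch of the changed constructor, assuming only what the diff above shows:

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new   # cache: and proxy: both default to true
client.cache_enabled?                 # => true
client.proxy_enabled?                 # => true

direct = Twitterscraper::Client.new(cache: true, proxy: false)
direct.proxy_enabled?                 # => false
```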
data/lib/twitterscraper/proxy.rb
CHANGED

@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-      …
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end

     def size
       @items.size
     end

+    def empty?
+      @items.empty?
+    end
+
     private

     def reload

@@ -51,7 +53,6 @@ module Twitterscraper
           proxies << ip + ':' + port
         end

-        Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
         proxies.shuffle
       rescue => e
         if (retries -= 1) > 0
data/lib/twitterscraper/query.rb
CHANGED

@@ -22,36 +22,41 @@ module Twitterscraper
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
         'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
-    INIT_URL_USER = 'https://twitter.com/…
-    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/…
+    INIT_URL_USER = 'https://twitter.com/__USER__'
+    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
         'include_available_features=1&include_entities=1&' +
-        'max_position=…
-
-    def build_query_url(query, lang, …)
-      … (old body truncated in source)
-      if pos
-        RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        'max_position=__POS__&reset_error_state=false'
+
+    def build_query_url(query, lang, type, pos)
+      if type == 'user'
+        if pos
+          RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+        else
+          INIT_URL_USER.sub('__USER__', query)
+        end
       else
-        …
+        if pos
+          RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        else
+          INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+        end
       end
     end

     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-      …
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
-      logger.debug "…
+      logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info …
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise …
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end

@@ -70,28 +75,28 @@ module Twitterscraper
       [items_html, json_resp]
     end

-    def query_single_page(query, lang, …)
-      logger.info …
+    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)

-      url = build_query_url(query, lang, …)
+      url = build_query_url(query, lang, type, pos)
       http_request = lambda do
-        logger.debug …
+        logger.debug "Scraping tweets from #{url}"
         get_single_page(url, headers, proxies)
       end

       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug …
+          logger.debug 'Fetching tweets from cache'
         else
           response = http_request.call
-          client.write(url, response)
+          client.write(url, response) unless stop_requested?
         end
       else
         response = http_request.call
       end
-      return [], nil if response.nil?
+      return [], nil if response.nil? || response.empty?

       html, json_resp = parse_single_page(response, pos.nil?)

@@ -103,8 +108,8 @@ module Twitterscraper

       if json_resp
         [tweets, json_resp['min_position']]
-      elsif …
-        …
+      elsif type == 'user'
+        [tweets, tweets[-1].tweet_id]
       else
         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
       end

@@ -112,33 +117,34 @@ module Twitterscraper

     OLDEST_DATE = Date.parse('2006-03-21')

-    def validate_options!(…)
+    def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end

       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end

       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end

       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end

       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end

@@ -156,27 +162,32 @@ module Twitterscraper
       end
     end

-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []

       while true
-        new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info …
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit

         pos = new_pos
       end

-      if @all_tweets.size >= limit
-        logger.…
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end

@@ -185,37 +196,59 @@ module Twitterscraper
       @stop_requested
     end

-    def query_tweets(query, start_date: nil, end_date: nil, lang: …)
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-      … (old body truncated in source)
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)

-      …
+      validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"

       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info …
+      logger.info "Headers #{headers}"

       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false

       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end

-      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+    end
+
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    end
+
+    def user_timeline(screen_name, limit: 100, order: 'desc')
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
     end
   end
 end
data/lib/twitterscraper/tweet.rb
CHANGED

@@ -59,12 +59,19 @@ module Twitterscraper
     def from_tweets_html(html)
       html.map do |tweet|
         from_tweet_html(tweet)
-      end
+      end.compact
     end

     def from_tweet_html(html)
+      screen_name = html.attr('data-screen-name')
+      tweet_id = html.attr('data-tweet-id')&.to_i
+
+      unless html.to_s.include?('js-tweet-text-container')
+        Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+        return nil
+      end
+
       inner_html = Nokogiri::HTML(html.inner_html)
-      tweet_id = html.attr('data-tweet-id').to_i
       text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
       links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
       image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }

@@ -89,7 +96,7 @@ module Twitterscraper

       timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
       new(
-        screen_name: …,
+        screen_name: screen_name,
         name: html.attr('data-name'),
         user_id: html.attr('data-user-id').to_i,
         tweet_id: tweet_id,
data/lib/version.rb
CHANGED
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.15.1
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-…
+date: 2020-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri

@@ -46,8 +46,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
|