twitterscraper-ruby 0.11.0 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +31 -0
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +16 -1
- data/README.md +87 -56
- data/lib/twitterscraper/cli.rb +17 -5
- data/lib/twitterscraper/client.rb +6 -1
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +82 -49
- data/lib/twitterscraper/tweet.rb +10 -3
- data/lib/version.rb +1 -1
- metadata +4 -2
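The most visible change across these files is the reworked client API: `Client.new` now takes `cache:` and `proxy:` keyword options (both defaulting to true), and the new `search` and `user_timeline` methods wrap `query_tweets`. A minimal usage sketch assembled from the README and query.rb changes below; the keyword and screen name are placeholders:

```ruby
require 'twitterscraper'

# cache: and proxy: both default to true in 0.15.1 (see client.rb below).
client = Twitterscraper::Client.new(cache: true, proxy: true)

# Keyword search, the Ruby equivalent of the new `--type search` CLI mode.
tweets = client.search('ruby', start_date: '2020-06-01', end_date: '2020-06-30',
                       lang: 'ja', limit: 100, threads: 10)

# Most recent tweets of one account, the equivalent of `--type user`.
timeline = client.user_timeline('ts_3156', limit: 100)

tweets.each { |tweet| puts tweet.tweet_url }
```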
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7f04cb0ba394884918271b5485b596c07203b7a6e9f4fec42d074ef4f02b6a0a
+  data.tar.gz: a4f618df53d1e8b54954619e87d383e43dbe5a63bbf83b33ee38f975998f2678
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fa9f02cf3ef0bf280f45b18ebacaec0b06dbd610477355602fcc59d382b5590c990695297e1e793457fdcff4cb7dd037f076c1f0fa4706eb69c67c3a165243e4
+  data.tar.gz: 9c08d9e4d1ee56fa133675bc73a50f502040cc9a2844d9a46a39c38ccdffdf43c15b17c2e4a8b74561f523493ccbc4a055f0add239574d2f5129ee4abe1f5ed9
data/.circleci/config.yml
ADDED
@@ -0,0 +1,31 @@
+version: 2.1
+orbs:
+  ruby: circleci/ruby@0.1.2
+
+jobs:
+  build:
+    docker:
+      - image: circleci/ruby:2.6.4-stretch-node
+        environment:
+          BUNDLER_VERSION: 2.1.4
+    executor: ruby/default
+    steps:
+      - checkout
+      - run:
+          name: Update bundler
+          command: gem update bundler
+      - run:
+          name: Which bundler?
+          command: bundle -v
+      - restore_cache:
+          keys:
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}
+            - gem-cache-v1
+      - run: bundle install --path vendor/bundle
+      - run: bundle clean
+      - save_cache:
+          key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+          paths:
+            - vendor/bundle
+      - run: bundle exec rspec
data/.rspec
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.11.0)
+    twitterscraper-ruby (0.15.1)
       nokogiri
       parallel

 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)

 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!

 BUNDLED WITH
data/README.md
CHANGED
@@ -1,18 +1,21 @@
 # twitterscraper-ruby

+[](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
 [](https://badge.fury.io/rb/twitterscraper-ruby)

 A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).

+Please feel free to ask [@ts_3156](https://twitter.com/ts_3156) if you have any questions.
+

 ## Twitter Search API vs. twitterscraper-ruby

-
+#### Twitter Search API

 - The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
 - The time window: the past 7 days

-
+#### twitterscraper-ruby

 - The number of tweets: Unlimited
 - The time window: from 2006-3-21 to today
@@ -29,45 +32,92 @@ $ gem install twitterscraper-ruby

 ## Usage

-Command-line interface:
+#### Command-line interface:
+
+Returns a collection of relevant tweets matching a specified query.

 ```shell script
-$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --
+$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
+    --limit 100 --threads 10 --output tweets.json
 ```

-
+Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+
+```shell script
+$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
+```
+
+#### From Within Ruby:

 ```ruby
 require 'twitterscraper'
+client = Twitterscraper::Client.new(cache: true, proxy: true)
+```

-
-  start_date: '2020-06-01',
-  end_date: '2020-06-30',
-  lang: 'ja',
-  limit: 100,
-  threads: 10,
-  proxy: true
-}
+Returns a collection of relevant tweets matching a specified query.

-
-tweets = client.
+```ruby
+tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
+```
+
+Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+
+```ruby
+tweets = client.user_timeline(SCREEN_NAME, limit: 100)
+```

+
+## Examples
+
+```shell script
+$ twitterscraper --query twitter --limit 1000
+$ cat tweets.json | jq . | less
+```
+
+
+## Attributes
+
+### Tweet
+
+```ruby
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
   puts tweet.tweet_url
   puts tweet.created_at

+  attr_names = hash.keys
   hash = tweet.attrs
-
+  json = tweet.to_json
 end
 ```

-
-
-
-
+```json
+[
+  {
+    "screen_name": "@name",
+    "name": "Name",
+    "user_id": 12340000,
+    "tweet_id": 1234000000000000,
+    "text": "Thanks Twitter!",
+    "links": [],
+    "hashtags": [],
+    "image_urls": [],
+    "video_url": null,
+    "has_media": null,
+    "likes": 10,
+    "retweets": 20,
+    "replies": 0,
+    "is_replied": false,
+    "is_reply_to": false,
+    "parent_tweet_id": null,
+    "reply_to_users": [],
+    "tweet_url": "https://twitter.com/name/status/1234000000000000",
+    "timestamp": 1594793000,
+    "created_at": "2020-07-15 00:00:00 +0000"
+  }
+]
+```

 - screen_name
 - name
@@ -110,43 +160,24 @@ end
 Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).


-## Examples
-
-```shell script
-$ twitterscraper --query twitter --limit 1000
-$ cat tweets.json | jq . | less
-```
-
-```json
-[
-  {
-    "screen_name": "@screenname",
-    "name": "name",
-    "user_id": 1194529546483000000,
-    "tweet_id": 1282659891992000000,
-    "tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
-    "created_at": "2020-07-13 12:00:00 +0000",
-    "text": "Thanks Twitter!"
-  }
-]
-```
-
 ## CLI Options

-| Option | Description |
-| ------------- | ------------- | ------------- |
-|
-| `--
-| `--
-| `--
-| `--
-| `--
-| `--
-| `--
-| `--
-| `--
-| `--
-| `--
+| Option | Type | Description | Value |
+| ------------- | ------------- | ------------- | ------------- |
+| `--help` | string | This option displays a summary of twitterscraper. | |
+| `--type` | string | Specify a search type. | search(default) or user |
+| `--query` | string | Specify a keyword used during the search. | |
+| `--start_date` | string | Used as "since:yyyy-mm-dd for your query. This means "since the date". | |
+| `--end_date` | string | Used as "until:yyyy-mm-dd for your query. This means "before the date". | |
+| `--lang` | string | Retrieve tweets written in a specific language. | |
+| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+| `--order` | string | Sort a order of the results. | desc(default) or asc |
+| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
+| `--cache` | boolean | Enable caching. | true(default) or false |
+| `--format` | string | The format of the output. | json(default) or html |
+| `--output` | string | The name of the output file. | tweets.json |
+| `--verbose` | | Print debug messages. | |


 ## Contributing
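The Attributes section above shows `tweet.attrs` returning a hash and `tweet.to_json` serializing a single tweet. A small sketch of dumping a whole result set to a file, assuming `tweets` comes from one of the calls shown above and the output path is writable:

```ruby
require 'json'

# Serialize every scraped tweet using the attrs helper shown above.
File.write('tweets.json', JSON.pretty_generate(tweets.map(&:attrs)))
```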
data/lib/twitterscraper/cli.rb
CHANGED
@@ -16,14 +16,16 @@ module Twitterscraper
       print_version || return if print_version?

       query_options = {
+        type: options['type'],
         start_date: options['start_date'],
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
+        order: options['order'],
         threads: options['threads'],
-        proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new(cache: options['cache'])
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
       export(tweets) unless tweets.empty?
     end
@@ -58,26 +60,36 @@ module Twitterscraper
           'help',
           'v',
           'version',
+          'type:',
           'query:',
           'start_date:',
           'end_date:',
           'lang:',
           'limit:',
+          'daily_limit:',
+          'order:',
           'threads:',
           'output:',
           'format:',
-          'cache',
-          'proxy',
+          'cache:',
+          'proxy:',
           'pretty',
           'verbose',
       )

+      options['type'] ||= 'search'
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
       options['format'] ||= 'json'
+      options['order'] ||= 'desc'
       options['output'] ||= "tweets.#{options['format']}"

+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
+
       options
     end

@@ -101,7 +113,7 @@ module Twitterscraper
     end

     def print_version
-      puts "twitterscraper-#{
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
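Note how `cache` and `proxy` switched from bare flags to value-taking options (`'cache:'`, `'proxy:'`), with the string `'false'` as the only value that disables them. A standalone illustration of that parsing rule (not the gem's actual CLI class):

```ruby
# Mirrors options['cache'] = options['cache'] != 'false' from the hunk above.
def boolean_option(raw_value)
  raw_value != 'false' # nil (option omitted) and 'true' both mean enabled
end

boolean_option(nil)     # => true  (the default)
boolean_option('true')  # => true
boolean_option('false') # => false
```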
data/lib/twitterscraper/client.rb
CHANGED
@@ -2,12 +2,17 @@ module Twitterscraper
   class Client
     include Query

-    def initialize(cache:)
+    def initialize(cache: true, proxy: true)
       @cache = cache
+      @proxy = proxy
     end

     def cache_enabled?
       @cache
     end
+
+    def proxy_enabled?
+      @proxy
+    end
   end
 end
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end

     def size
       @items.size
     end

+    def empty?
+      @items.empty?
+    end
+
     private

     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end

-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
       proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
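With logging and indexing removed from `Pool#sample`, callers now decide when to pick and announce a proxy; the query.rb hunks below do this once per request. A rough sketch of that calling pattern, assuming `Http` is `Twitterscraper::Http` as referenced from query.rb (error handling omitted):

```ruby
url = 'https://twitter.com/i/search/timeline?q=ruby'
headers = {'User-Agent': 'example'}

pool = Twitterscraper::Proxy::Pool.new # [] is used instead when the proxy option is false

proxy = nil
unless pool.empty?
  proxy = pool.sample
  Twitterscraper.logger.info("Using proxy #{proxy}")
end

# proxy may stay nil when the pool is empty, matching get_single_page in query.rb below
response = Twitterscraper::Http.get(url, headers, proxy, 6)
```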
data/lib/twitterscraper/query.rb
CHANGED
@@ -22,36 +22,41 @@ module Twitterscraper
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
         'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
-    INIT_URL_USER = 'https://twitter.com/
-    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/
+    INIT_URL_USER = 'https://twitter.com/__USER__'
+    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
         'include_available_features=1&include_entities=1&' +
-        'max_position=
-
-    def build_query_url(query, lang,
-
-
-
-
-
-
-      # end
-      if pos
-        RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        'max_position=__POS__&reset_error_state=false'
+
+    def build_query_url(query, lang, type, pos)
+      if type == 'user'
+        if pos
+          RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+        else
+          INIT_URL_USER.sub('__USER__', query)
+        end
       else
-
+        if pos
+          RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        else
+          INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+        end
       end
     end

     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
-      logger.debug "
+      logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end

@@ -70,28 +75,28 @@ module Twitterscraper
       [items_html, json_resp]
     end

-    def query_single_page(query, lang,
-      logger.info
+    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)

-      url = build_query_url(query, lang,
+      url = build_query_url(query, lang, type, pos)
       http_request = lambda do
-        logger.debug
+        logger.debug "Scraping tweets from #{url}"
         get_single_page(url, headers, proxies)
       end

       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug
+          logger.debug 'Fetching tweets from cache'
         else
           response = http_request.call
-          client.write(url, response)
+          client.write(url, response) unless stop_requested?
         end
       else
         response = http_request.call
       end
-      return [], nil if response.nil?
+      return [], nil if response.nil? || response.empty?

       html, json_resp = parse_single_page(response, pos.nil?)

@@ -103,8 +108,8 @@ module Twitterscraper

       if json_resp
         [tweets, json_resp['min_position']]
-      elsif
-
+      elsif type
+        [tweets, tweets[-1].tweet_id]
       else
         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
       end
@@ -112,33 +117,34 @@ module Twitterscraper

     OLDEST_DATE = Date.parse('2006-03-21')

-    def validate_options!(
+    def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end

       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end

       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end

       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end

       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -156,27 +162,32 @@ module Twitterscraper
       end
     end

-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []

       while true
-        new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit

         pos = new_pos
       end

-      if @all_tweets.size >= limit
-        logger.
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -185,37 +196,59 @@ module Twitterscraper
       @stop_requested
     end

-    def query_tweets(query, start_date: nil, end_date: nil, lang:
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-
-
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)

-
+      validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"

       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info
+      logger.info "Headers #{headers}"

       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false

       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end

-      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+    end
+
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    end
+
+    def user_timeline(screen_name, limit: 100, order: 'desc')
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
     end
   end
 end
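The final sort now honors the new `order` option: negating the timestamp gives newest-first for `'desc'`, and leaving it positive gives oldest-first for `'asc'`. A standalone illustration of that expression using a stand-in Struct rather than the gem's Tweet class:

```ruby
require 'time'

StubTweet = Struct.new(:tweet_id, :created_at)
tweets = [
  StubTweet.new(1, Time.parse('2020-07-13 12:00:00 +0000')),
  StubTweet.new(2, Time.parse('2020-07-15 00:00:00 +0000'))
]

order = 'desc'
sorted = tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
sorted.map(&:tweet_id) # => [2, 1]; with order = 'asc' it would be [1, 2]
```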
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -59,12 +59,19 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end

       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -89,7 +96,7 @@ module Twitterscraper

         timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-            screen_name:
+            screen_name: screen_name,
             name: html.attr('data-name'),
             user_id: html.attr('data-user-id').to_i,
             tweet_id: tweet_id,
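`from_tweet_html` can now bail out with nil when a tweet's HTML has no `js-tweet-text-container` div, and `from_tweets_html` strips those nils with `compact`. The general map-then-compact pattern in isolation (hypothetical parser, not the gem's Tweet class):

```ruby
# Entries the parser rejects are dropped up front instead of raising later
# when `.first.text` would be called on a missing node.
def parse_items(nodes)
  nodes.map { |node| parse_item(node) }.compact
end

def parse_item(node)
  return nil unless node.to_s.include?('js-tweet-text-container')
  node # a real parser would build a Tweet from the node here
end
```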
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.15.1
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -46,8 +46,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md