twitterscraper-ruby 0.14.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +89 -56
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +7 -1
- data/lib/twitterscraper/cli.rb +17 -5
- data/lib/twitterscraper/query.rb +81 -45
- data/lib/twitterscraper/template.rb +23 -41
- data/lib/twitterscraper/template/tweets.html.erb +98 -0
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/twitterscraper/type.rb +15 -0
- data/lib/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ac0c10b18d836983cc6b73e25b9ed333af2f620106a07c6bc6a40058fb127895
|
4
|
+
data.tar.gz: e6fc18219d9127fb30ba57e39dc4656c0f0a3c108428d959de5bac9e7d317088
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90cbf06b606878dc36b4bba44669139c273bf03b08a777ad87036834841bcb4b052e0559813dc56e4be124442abfc5a7fc44c5c9524c74929ca02b1d287d346b
|
7
|
+
data.tar.gz: ada0b74ee42ff62964b73ad9b49358227cdaf4fc87420cf12cf65af95168ad9775615a504345ebc83d3b791e9c0d892691c55bc477eddd647b3e8934f752fb9c
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -5,15 +5,17 @@
|
|
5
5
|
|
6
6
|
A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
|
7
7
|
|
8
|
+
Please feel free to ask [@ts_3156](https://twitter.com/ts_3156) if you have any questions.
|
9
|
+
|
8
10
|
|
9
11
|
## Twitter Search API vs. twitterscraper-ruby
|
10
12
|
|
11
|
-
|
13
|
+
#### Twitter Search API
|
12
14
|
|
13
15
|
- The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
|
14
16
|
- The time window: the past 7 days
|
15
17
|
|
16
|
-
|
18
|
+
#### twitterscraper-ruby
|
17
19
|
|
18
20
|
- The number of tweets: Unlimited
|
19
21
|
- The time window: from 2006-3-21 to today
|
@@ -30,48 +32,98 @@ $ gem install twitterscraper-ruby
|
|
30
32
|
|
31
33
|
## Usage
|
32
34
|
|
33
|
-
Command-line interface:
|
35
|
+
#### Command-line interface:
|
36
|
+
|
37
|
+
Returns a collection of relevant tweets matching a specified query.
|
34
38
|
|
35
39
|
```shell script
|
36
|
-
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
37
|
-
--limit 100 --threads 10 --output
|
40
|
+
$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
41
|
+
--limit 100 --threads 10 --output tweets.json
|
38
42
|
```
|
39
43
|
|
40
|
-
|
44
|
+
Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
|
45
|
+
|
46
|
+
```shell script
|
47
|
+
$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
|
48
|
+
```
|
49
|
+
|
50
|
+
#### From Within Ruby:
|
41
51
|
|
42
52
|
```ruby
|
43
53
|
require 'twitterscraper'
|
54
|
+
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
55
|
+
```
|
44
56
|
|
45
|
-
|
46
|
-
start_date: '2020-06-01',
|
47
|
-
end_date: '2020-06-30',
|
48
|
-
lang: 'ja',
|
49
|
-
limit: 100,
|
50
|
-
threads: 10,
|
51
|
-
}
|
57
|
+
Returns a collection of relevant tweets matching a specified query.
|
52
58
|
|
53
|
-
|
54
|
-
tweets = client.
|
59
|
+
```ruby
|
60
|
+
tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
|
61
|
+
```
|
62
|
+
|
63
|
+
Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
tweets = client.user_timeline(SCREEN_NAME, limit: 100)
|
67
|
+
```
|
68
|
+
|
69
|
+
|
70
|
+
## Examples
|
71
|
+
|
72
|
+
```shell script
|
73
|
+
$ twitterscraper --query twitter --limit 1000
|
74
|
+
$ cat tweets.json | jq . | less
|
75
|
+
```
|
76
|
+
|
77
|
+
|
78
|
+
## Attributes
|
55
79
|
|
80
|
+
### Tweet
|
81
|
+
|
82
|
+
```ruby
|
56
83
|
tweets.each do |tweet|
|
57
84
|
puts tweet.tweet_id
|
58
85
|
puts tweet.text
|
59
86
|
puts tweet.tweet_url
|
60
87
|
puts tweet.created_at
|
61
88
|
|
89
|
+
attr_names = hash.keys
|
62
90
|
hash = tweet.attrs
|
63
|
-
|
91
|
+
json = tweet.to_json
|
64
92
|
end
|
65
93
|
```
|
66
94
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
95
|
+
```json
|
96
|
+
[
|
97
|
+
{
|
98
|
+
"screen_name": "@name",
|
99
|
+
"name": "Name",
|
100
|
+
"user_id": 12340000,
|
101
|
+
"profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
|
102
|
+
"tweet_id": 1234000000000000,
|
103
|
+
"text": "Thanks Twitter!",
|
104
|
+
"links": [],
|
105
|
+
"hashtags": [],
|
106
|
+
"image_urls": [],
|
107
|
+
"video_url": null,
|
108
|
+
"has_media": null,
|
109
|
+
"likes": 10,
|
110
|
+
"retweets": 20,
|
111
|
+
"replies": 0,
|
112
|
+
"is_replied": false,
|
113
|
+
"is_reply_to": false,
|
114
|
+
"parent_tweet_id": null,
|
115
|
+
"reply_to_users": [],
|
116
|
+
"tweet_url": "https://twitter.com/name/status/1234000000000000",
|
117
|
+
"timestamp": 1594793000,
|
118
|
+
"created_at": "2020-07-15 00:00:00 +0000"
|
119
|
+
}
|
120
|
+
]
|
121
|
+
```
|
71
122
|
|
72
123
|
- screen_name
|
73
124
|
- name
|
74
125
|
- user_id
|
126
|
+
- profile_image_url
|
75
127
|
- tweet_id
|
76
128
|
- text
|
77
129
|
- links
|
@@ -110,44 +162,25 @@ end
|
|
110
162
|
Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
|
111
163
|
|
112
164
|
|
113
|
-
## Examples
|
114
|
-
|
115
|
-
```shell script
|
116
|
-
$ twitterscraper --query twitter --limit 1000
|
117
|
-
$ cat tweets.json | jq . | less
|
118
|
-
```
|
119
|
-
|
120
|
-
```json
|
121
|
-
[
|
122
|
-
{
|
123
|
-
"screen_name": "@screenname",
|
124
|
-
"name": "name",
|
125
|
-
"user_id": 1194529546483000000,
|
126
|
-
"tweet_id": 1282659891992000000,
|
127
|
-
"tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
|
128
|
-
"created_at": "2020-07-13 12:00:00 +0000",
|
129
|
-
"text": "Thanks Twitter!"
|
130
|
-
}
|
131
|
-
]
|
132
|
-
```
|
133
|
-
|
134
165
|
## CLI Options
|
135
166
|
|
136
|
-
| Option | Description |
|
137
|
-
| ------------- | ------------- | ------------- |
|
138
|
-
|
|
139
|
-
| `--
|
140
|
-
| `--
|
141
|
-
| `--
|
142
|
-
| `--
|
143
|
-
| `--
|
144
|
-
| `--
|
145
|
-
| `--
|
146
|
-
| `--
|
147
|
-
| `--
|
148
|
-
| `--
|
149
|
-
| `--
|
150
|
-
| `--
|
167
|
+
| Option | Type | Description | Value |
|
168
|
+
| ------------- | ------------- | ------------- | ------------- |
|
169
|
+
| `--help` | string | This option displays a summary of twitterscraper. | |
|
170
|
+
| `--type` | string | Specify a search type. | search(default) or user |
|
171
|
+
| `--query` | string | Specify a keyword used during the search. | |
|
172
|
+
| `--start_date` | string | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
173
|
+
| `--end_date` | string | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
174
|
+
| `--lang` | string | Retrieve tweets written in a specific language. | |
|
175
|
+
| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
176
|
+
| `--order` | string | Sort order of the results. | desc(default) or asc |
|
177
|
+
| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
178
|
+
| `--threads_granularity` | string | | auto |
|
179
|
+
| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
|
180
|
+
| `--cache` | boolean | Enable caching. | true(default) or false |
|
181
|
+
| `--format` | string | The format of the output. | json(default) or html |
|
182
|
+
| `--output` | string | The name of the output file. | tweets.json |
|
183
|
+
| `--verbose` | | Print debug messages. | |
|
151
184
|
|
152
185
|
|
153
186
|
## Contributing
|
data/lib/twitterscraper.rb
CHANGED
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
|
|
4
4
|
module Twitterscraper
|
5
5
|
class Cache
|
6
6
|
def initialize()
|
7
|
-
@ttl =
|
7
|
+
@ttl = 86400 # 1 day
|
8
8
|
@dir = 'cache'
|
9
9
|
Dir.mkdir(@dir) unless File.exist?(@dir)
|
10
10
|
end
|
@@ -25,6 +25,12 @@ module Twitterscraper
|
|
25
25
|
File.write(file, entry.to_json)
|
26
26
|
end
|
27
27
|
|
28
|
+
def delete(key)
|
29
|
+
key = cache_key(key)
|
30
|
+
file = File.join(@dir, key)
|
31
|
+
File.delete(file) if File.exist?(file)
|
32
|
+
end
|
33
|
+
|
28
34
|
def fetch(key, &block)
|
29
35
|
if (value = read(key))
|
30
36
|
value
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -16,6 +16,7 @@ module Twitterscraper
|
|
16
16
|
print_version || return if print_version?
|
17
17
|
|
18
18
|
query_options = {
|
19
|
+
type: options['type'],
|
19
20
|
start_date: options['start_date'],
|
20
21
|
end_date: options['end_date'],
|
21
22
|
lang: options['lang'],
|
@@ -23,19 +24,20 @@ module Twitterscraper
|
|
23
24
|
daily_limit: options['daily_limit'],
|
24
25
|
order: options['order'],
|
25
26
|
threads: options['threads'],
|
27
|
+
threads_granularity: options['threads_granularity'],
|
26
28
|
}
|
27
29
|
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
28
30
|
tweets = client.query_tweets(options['query'], query_options)
|
29
|
-
export(tweets) unless tweets.empty?
|
31
|
+
export(options['query'], tweets) unless tweets.empty?
|
30
32
|
end
|
31
33
|
|
32
|
-
def export(tweets)
|
34
|
+
def export(name, tweets)
|
33
35
|
write_json = lambda { File.write(options['output'], generate_json(tweets)) }
|
34
36
|
|
35
37
|
if options['format'] == 'json'
|
36
38
|
write_json.call
|
37
39
|
elsif options['format'] == 'html'
|
38
|
-
File.write('
|
40
|
+
File.write(options['output'], Template.new.tweets_embedded_html(name, tweets, options))
|
39
41
|
else
|
40
42
|
write_json.call
|
41
43
|
end
|
@@ -59,6 +61,7 @@ module Twitterscraper
|
|
59
61
|
'help',
|
60
62
|
'v',
|
61
63
|
'version',
|
64
|
+
'type:',
|
62
65
|
'query:',
|
63
66
|
'start_date:',
|
64
67
|
'end_date:',
|
@@ -67,6 +70,7 @@ module Twitterscraper
|
|
67
70
|
'daily_limit:',
|
68
71
|
'order:',
|
69
72
|
'threads:',
|
73
|
+
'threads_granularity:',
|
70
74
|
'output:',
|
71
75
|
'format:',
|
72
76
|
'cache:',
|
@@ -75,14 +79,16 @@ module Twitterscraper
|
|
75
79
|
'verbose',
|
76
80
|
)
|
77
81
|
|
82
|
+
options['type'] ||= 'search'
|
78
83
|
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
79
84
|
options['lang'] ||= ''
|
80
85
|
options['limit'] = (options['limit'] || 100).to_i
|
81
86
|
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
82
|
-
options['threads'] = (options['threads'] ||
|
87
|
+
options['threads'] = (options['threads'] || 10).to_i
|
88
|
+
options['threads_granularity'] ||= 'auto'
|
83
89
|
options['format'] ||= 'json'
|
84
90
|
options['order'] ||= 'desc'
|
85
|
-
options['output'] ||=
|
91
|
+
options['output'] ||= build_output_name(options)
|
86
92
|
|
87
93
|
options['cache'] = options['cache'] != 'false'
|
88
94
|
options['proxy'] = options['proxy'] != 'false'
|
@@ -90,6 +96,12 @@ module Twitterscraper
|
|
90
96
|
options
|
91
97
|
end
|
92
98
|
|
99
|
+
def build_output_name(options)
|
100
|
+
query = ERB::Util.url_encode(options['query'])
|
101
|
+
date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
|
102
|
+
[options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format']
|
103
|
+
end
|
104
|
+
|
93
105
|
def initialize_logger
|
94
106
|
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
95
107
|
end
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -22,23 +22,24 @@ module Twitterscraper
|
|
22
22
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
23
23
|
'default&include_available_features=1&include_entities=1&' +
|
24
24
|
'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
|
25
|
-
INIT_URL_USER = 'https://twitter.com/
|
26
|
-
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/
|
25
|
+
INIT_URL_USER = 'https://twitter.com/__USER__'
|
26
|
+
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
|
27
27
|
'include_available_features=1&include_entities=1&' +
|
28
|
-
'max_position=
|
29
|
-
|
30
|
-
def build_query_url(query, lang,
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
# end
|
38
|
-
if pos
|
39
|
-
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
28
|
+
'max_position=__POS__&reset_error_state=false'
|
29
|
+
|
30
|
+
def build_query_url(query, lang, type, pos)
|
31
|
+
if type.user?
|
32
|
+
if pos
|
33
|
+
RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
|
34
|
+
else
|
35
|
+
INIT_URL_USER.sub('__USER__', query)
|
36
|
+
end
|
40
37
|
else
|
41
|
-
|
38
|
+
if pos
|
39
|
+
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
40
|
+
else
|
41
|
+
INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
|
42
|
+
end
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
@@ -50,7 +51,7 @@ module Twitterscraper
|
|
50
51
|
end
|
51
52
|
Http.get(url, headers, proxy, timeout)
|
52
53
|
rescue => e
|
53
|
-
logger.debug "
|
54
|
+
logger.debug "get_single_page: #{e.inspect}"
|
54
55
|
if (retries -= 1) > 0
|
55
56
|
logger.info "Retrying... (Attempts left: #{retries - 1})"
|
56
57
|
retry
|
@@ -68,26 +69,25 @@ module Twitterscraper
|
|
68
69
|
else
|
69
70
|
json_resp = JSON.parse(text)
|
70
71
|
items_html = json_resp['items_html'] || ''
|
71
|
-
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
72
72
|
end
|
73
73
|
|
74
74
|
[items_html, json_resp]
|
75
75
|
end
|
76
76
|
|
77
|
-
def query_single_page(query, lang,
|
77
|
+
def query_single_page(query, lang, type, pos, headers: [], proxies: [])
|
78
78
|
logger.info "Querying #{query}"
|
79
79
|
query = ERB::Util.url_encode(query)
|
80
80
|
|
81
|
-
url = build_query_url(query, lang,
|
81
|
+
url = build_query_url(query, lang, type, pos)
|
82
82
|
http_request = lambda do
|
83
|
-
logger.debug "Scraping tweets from
|
83
|
+
logger.debug "Scraping tweets from url=#{url}"
|
84
84
|
get_single_page(url, headers, proxies)
|
85
85
|
end
|
86
86
|
|
87
87
|
if cache_enabled?
|
88
88
|
client = Cache.new
|
89
89
|
if (response = client.read(url))
|
90
|
-
logger.debug
|
90
|
+
logger.debug "Fetching tweets from cache url=#{url}"
|
91
91
|
else
|
92
92
|
response = http_request.call
|
93
93
|
client.write(url, response) unless stop_requested?
|
@@ -99,6 +99,12 @@ module Twitterscraper
|
|
99
99
|
|
100
100
|
html, json_resp = parse_single_page(response, pos.nil?)
|
101
101
|
|
102
|
+
if json_resp && json_resp['message']
|
103
|
+
logger.warn json_resp['message'] # Sorry, you are rate limited.
|
104
|
+
@stop_requested = true
|
105
|
+
Cache.new.delete(url) if cache_enabled?
|
106
|
+
end
|
107
|
+
|
102
108
|
tweets = Tweet.from_html(html)
|
103
109
|
|
104
110
|
if tweets.empty?
|
@@ -107,8 +113,8 @@ module Twitterscraper
|
|
107
113
|
|
108
114
|
if json_resp
|
109
115
|
[tweets, json_resp['min_position']]
|
110
|
-
elsif
|
111
|
-
|
116
|
+
elsif type.user?
|
117
|
+
[tweets, tweets[-1].tweet_id]
|
112
118
|
else
|
113
119
|
[tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
|
114
120
|
end
|
@@ -116,7 +122,7 @@ module Twitterscraper
|
|
116
122
|
|
117
123
|
OLDEST_DATE = Date.parse('2006-03-21')
|
118
124
|
|
119
|
-
def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
|
125
|
+
def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
|
120
126
|
query = queries[0]
|
121
127
|
if query.nil? || query == ''
|
122
128
|
raise Error.new('Please specify a search query.')
|
@@ -139,19 +145,33 @@ module Twitterscraper
|
|
139
145
|
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
140
146
|
end
|
141
147
|
end
|
142
|
-
|
143
|
-
if end_date
|
144
|
-
today = Date.today
|
145
|
-
if end_date > Date.today
|
146
|
-
raise Error.new(":end_date must be less than or equal to today(#{today})")
|
147
|
-
end
|
148
|
-
end
|
149
148
|
end
|
150
149
|
|
151
|
-
def build_queries(query, start_date, end_date)
|
150
|
+
def build_queries(query, start_date, end_date, threads_granularity)
|
152
151
|
if start_date && end_date
|
153
|
-
|
154
|
-
|
152
|
+
if threads_granularity == 'auto'
|
153
|
+
threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
|
154
|
+
end
|
155
|
+
|
156
|
+
if threads_granularity == 'day'
|
157
|
+
date_range = start_date.upto(end_date - 1)
|
158
|
+
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
159
|
+
else
|
160
|
+
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
161
|
+
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
162
|
+
queries = []
|
163
|
+
|
164
|
+
while true
|
165
|
+
if time < Time.now.utc
|
166
|
+
queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
|
167
|
+
end
|
168
|
+
time += 3600
|
169
|
+
break if time >= end_time
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
queries
|
174
|
+
|
155
175
|
elsif start_date
|
156
176
|
[query + " since:#{start_date}"]
|
157
177
|
elsif end_date
|
@@ -161,12 +181,12 @@ module Twitterscraper
|
|
161
181
|
end
|
162
182
|
end
|
163
183
|
|
164
|
-
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
184
|
+
def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
165
185
|
pos = nil
|
166
186
|
daily_tweets = []
|
167
187
|
|
168
188
|
while true
|
169
|
-
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
189
|
+
new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
|
170
190
|
unless new_tweets.empty?
|
171
191
|
daily_tweets.concat(new_tweets)
|
172
192
|
daily_tweets.uniq! { |t| t.tweet_id }
|
@@ -195,12 +215,18 @@ module Twitterscraper
|
|
195
215
|
@stop_requested
|
196
216
|
end
|
197
217
|
|
198
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang:
|
199
|
-
|
200
|
-
|
201
|
-
|
218
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
219
|
+
type = Type.new(type)
|
220
|
+
if type.search?
|
221
|
+
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
222
|
+
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
223
|
+
elsif type.user?
|
224
|
+
start_date = nil
|
225
|
+
end_date = nil
|
226
|
+
end
|
227
|
+
|
228
|
+
queries = build_queries(query, start_date, end_date, threads_granularity)
|
202
229
|
if threads > queries.size
|
203
|
-
logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
|
204
230
|
threads = queries.size
|
205
231
|
end
|
206
232
|
if proxy_enabled?
|
@@ -212,9 +238,9 @@ module Twitterscraper
|
|
212
238
|
end
|
213
239
|
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
214
240
|
|
241
|
+
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
215
242
|
|
216
|
-
|
217
|
-
|
243
|
+
logger.info "The number of queries #{queries.size}"
|
218
244
|
logger.info "The number of threads #{threads}"
|
219
245
|
|
220
246
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
@@ -229,17 +255,27 @@ module Twitterscraper
|
|
229
255
|
logger.debug "Set 'Thread.abort_on_exception' to true"
|
230
256
|
|
231
257
|
Parallel.each(queries, in_threads: threads) do |query|
|
232
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
258
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
233
259
|
raise Parallel::Break if stop_requested?
|
234
260
|
end
|
235
261
|
else
|
236
262
|
queries.each do |query|
|
237
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
263
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
238
264
|
break if stop_requested?
|
239
265
|
end
|
240
266
|
end
|
241
267
|
|
268
|
+
logger.info "Return #{@all_tweets.size} tweets"
|
269
|
+
|
242
270
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
243
271
|
end
|
272
|
+
|
273
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
274
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
|
275
|
+
end
|
276
|
+
|
277
|
+
def user_timeline(screen_name, limit: 100, order: 'desc')
|
278
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
|
279
|
+
end
|
244
280
|
end
|
245
281
|
end
|
@@ -1,48 +1,30 @@
|
|
1
1
|
module Twitterscraper
|
2
|
-
|
3
|
-
|
2
|
+
class Template
|
3
|
+
def tweets_embedded_html(name, tweets, options)
|
4
|
+
path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
|
5
|
+
template = ERB.new(File.read(path))
|
4
6
|
|
5
|
-
|
6
|
-
|
7
|
-
|
7
|
+
template.result_with_hash(
|
8
|
+
chart_name: name,
|
9
|
+
chart_data: chart_data(tweets).to_json,
|
10
|
+
first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
|
11
|
+
last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
|
12
|
+
tweets: tweets,
|
13
|
+
convert_limit: 30,
|
14
|
+
)
|
8
15
|
end
|
9
16
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
17
|
+
def chart_data(tweets)
|
18
|
+
data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
|
19
|
+
t = tweet.created_at
|
20
|
+
min = (t.min.to_f / 5).floor * 5
|
21
|
+
time = Time.new(t.year, t.month, t.day, t.hour, min, 0, '+00:00')
|
22
|
+
memo[time.to_i] += 1
|
23
|
+
end
|
15
24
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
.twitter-tweet {
|
21
|
-
margin: 30px auto 0 auto !important;
|
22
|
-
}
|
23
|
-
</style>
|
24
|
-
<script>
|
25
|
-
window.twttr = (function(d, s, id) {
|
26
|
-
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
27
|
-
if (d.getElementById(id)) return t;
|
28
|
-
js = d.createElement(s);
|
29
|
-
js.id = id;
|
30
|
-
js.src = "https://platform.twitter.com/widgets.js";
|
31
|
-
fjs.parentNode.insertBefore(js, fjs);
|
32
|
-
|
33
|
-
t._e = [];
|
34
|
-
t.ready = function(f) {
|
35
|
-
t._e.push(f);
|
36
|
-
};
|
37
|
-
|
38
|
-
return t;
|
39
|
-
}(document, "script", "twitter-wjs"));
|
40
|
-
</script>
|
41
|
-
</head>
|
42
|
-
<body>
|
43
|
-
__TWEETS__
|
44
|
-
</body>
|
45
|
-
</html>
|
46
|
-
HTML
|
25
|
+
data.sort_by { |k, v| k }.map do |timestamp, count|
|
26
|
+
[timestamp * 1000, count]
|
27
|
+
end
|
28
|
+
end
|
47
29
|
end
|
48
30
|
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
|
4
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
|
5
|
+
<script src="https://code.highcharts.com/stock/highstock.js"></script>
|
6
|
+
<script>
|
7
|
+
function updateTweets() {
|
8
|
+
window.twttr = (function (d, s, id) {
|
9
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
10
|
+
if (d.getElementById(id)) return t;
|
11
|
+
js = d.createElement(s);
|
12
|
+
js.id = id;
|
13
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
14
|
+
fjs.parentNode.insertBefore(js, fjs);
|
15
|
+
|
16
|
+
t._e = [];
|
17
|
+
t.ready = function (f) {
|
18
|
+
t._e.push(f);
|
19
|
+
};
|
20
|
+
|
21
|
+
return t;
|
22
|
+
}(document, "script", "twitter-wjs"));
|
23
|
+
}
|
24
|
+
|
25
|
+
function drawChart() {
|
26
|
+
var data = <%= chart_data %>;
|
27
|
+
Highcharts.setOptions({
|
28
|
+
time: {
|
29
|
+
timezone: moment.tz.guess()
|
30
|
+
}
|
31
|
+
});
|
32
|
+
|
33
|
+
Highcharts.stockChart('chart', {
|
34
|
+
title: {
|
35
|
+
text: '<%= tweets.size %> tweets of <%= chart_name %>'
|
36
|
+
},
|
37
|
+
subtitle: {
|
38
|
+
text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
|
39
|
+
},
|
40
|
+
series: [{
|
41
|
+
data: data
|
42
|
+
}],
|
43
|
+
rangeSelector: {enabled: false},
|
44
|
+
scrollbar: {enabled: false},
|
45
|
+
navigator: {enabled: false},
|
46
|
+
exporting: {enabled: false},
|
47
|
+
credits: {enabled: false}
|
48
|
+
});
|
49
|
+
}
|
50
|
+
|
51
|
+
document.addEventListener("DOMContentLoaded", function () {
|
52
|
+
drawChart();
|
53
|
+
updateTweets();
|
54
|
+
});
|
55
|
+
</script>
|
56
|
+
|
57
|
+
<style type=text/css>
|
58
|
+
.tweets-container {
|
59
|
+
max-width: 550px;
|
60
|
+
margin: 0 auto 0 auto;
|
61
|
+
}
|
62
|
+
|
63
|
+
.twitter-tweet {
|
64
|
+
margin: 15px 0 15px 0 !important;
|
65
|
+
}
|
66
|
+
</style>
|
67
|
+
</head>
|
68
|
+
<body>
|
69
|
+
<div id="chart" style="width: 100vw; height: 400px;"></div>
|
70
|
+
|
71
|
+
<div class="tweets-container">
|
72
|
+
<% tweets.each.with_index do |tweet, i| %>
|
73
|
+
<% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
|
74
|
+
<% if i < convert_limit %>
|
75
|
+
<blockquote class="twitter-tweet">
|
76
|
+
<% else %>
|
77
|
+
<div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
|
78
|
+
<% end %>
|
79
|
+
|
80
|
+
<div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
|
81
|
+
<div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
|
82
|
+
<div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
|
83
|
+
<div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
|
84
|
+
</div>
|
85
|
+
|
86
|
+
<div><%= tweet.text %></div>
|
87
|
+
<div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
|
88
|
+
|
89
|
+
<% if i < convert_limit %>
|
90
|
+
</blockquote>
|
91
|
+
<% else %>
|
92
|
+
</div>
|
93
|
+
<% end %>
|
94
|
+
<% end %>
|
95
|
+
</div>
|
96
|
+
|
97
|
+
</body>
|
98
|
+
</html>
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
|
|
6
6
|
:screen_name,
|
7
7
|
:name,
|
8
8
|
:user_id,
|
9
|
+
:profile_image_url,
|
9
10
|
:tweet_id,
|
10
11
|
:text,
|
11
12
|
:links,
|
@@ -51,6 +52,11 @@ module Twitterscraper
|
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
55
|
+
# .js-stream-item
|
56
|
+
# .js-stream-tweet{data: {screen-name:, tweet-id:}}
|
57
|
+
# .stream-item-header
|
58
|
+
# .js-tweet-text-container
|
59
|
+
# .stream-item-footer
|
54
60
|
def from_html(text)
|
55
61
|
html = Nokogiri::HTML(text)
|
56
62
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
@@ -72,6 +78,8 @@ module Twitterscraper
|
|
72
78
|
end
|
73
79
|
|
74
80
|
inner_html = Nokogiri::HTML(html.inner_html)
|
81
|
+
|
82
|
+
profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
|
75
83
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
76
84
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
77
85
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -99,6 +107,7 @@ module Twitterscraper
|
|
99
107
|
screen_name: screen_name,
|
100
108
|
name: html.attr('data-name'),
|
101
109
|
user_id: html.attr('data-user-id').to_i,
|
110
|
+
profile_image_url: profile_image_url,
|
102
111
|
tweet_id: tweet_id,
|
103
112
|
text: text,
|
104
113
|
links: links,
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -72,7 +72,9 @@ files:
|
|
72
72
|
- lib/twitterscraper/proxy.rb
|
73
73
|
- lib/twitterscraper/query.rb
|
74
74
|
- lib/twitterscraper/template.rb
|
75
|
+
- lib/twitterscraper/template/tweets.html.erb
|
75
76
|
- lib/twitterscraper/tweet.rb
|
77
|
+
- lib/twitterscraper/type.rb
|
76
78
|
- lib/version.rb
|
77
79
|
- twitterscraper-ruby.gemspec
|
78
80
|
homepage: https://github.com/ts-3156/twitterscraper-ruby
|