twitterscraper-ruby 0.13.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +31 -0
- data/Gemfile.lock +1 -1
- data/README.md +87 -56
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +7 -1
- data/lib/twitterscraper/cli.rb +9 -3
- data/lib/twitterscraper/query.rb +61 -40
- data/lib/twitterscraper/template.rb +23 -41
- data/lib/twitterscraper/template/tweets.html.erb +82 -0
- data/lib/twitterscraper/type.rb +15 -0
- data/lib/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66dda5275a9067d328f6637f127895ded954534d304e5e4b349f286a271a08d8
|
4
|
+
data.tar.gz: 6c3ffb3fba82376fc2de49514245ea96c7cb4fa16c32dcd2fff1ab1ae327bd14
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 24267284f4f29adc86d5bbe70a30bbe31d6d898546576065f1a9accafc3944a352117bbf6eb0de273743a00fb2d26c5cf37ed016cc0324187a25ca279230d812
|
7
|
+
data.tar.gz: 0bc9f01659560c83b0289bf63119849135b7ec27520dd03c7abd645da99ef660ca4b5fd12301b359cd5cc45a82914d7ceae88ad93ad756fde166718b3d0fe6c2
|
@@ -0,0 +1,31 @@
|
|
1
|
+
version: 2.1
|
2
|
+
orbs:
|
3
|
+
ruby: circleci/ruby@0.1.2
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
docker:
|
8
|
+
- image: circleci/ruby:2.6.4-stretch-node
|
9
|
+
environment:
|
10
|
+
BUNDLER_VERSION: 2.1.4
|
11
|
+
executor: ruby/default
|
12
|
+
steps:
|
13
|
+
- checkout
|
14
|
+
- run:
|
15
|
+
name: Update bundler
|
16
|
+
command: gem update bundler
|
17
|
+
- run:
|
18
|
+
name: Which bundler?
|
19
|
+
command: bundle -v
|
20
|
+
- restore_cache:
|
21
|
+
keys:
|
22
|
+
- gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
|
23
|
+
- gem-cache-v1-{{ arch }}-{{ .Branch }}
|
24
|
+
- gem-cache-v1
|
25
|
+
- run: bundle install --path vendor/bundle
|
26
|
+
- run: bundle clean
|
27
|
+
- save_cache:
|
28
|
+
key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
|
29
|
+
paths:
|
30
|
+
- vendor/bundle
|
31
|
+
- run: bundle exec rspec
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,18 +1,21 @@
|
|
1
1
|
# twitterscraper-ruby
|
2
2
|
|
3
|
+
[![CircleCI](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
|
3
4
|
[![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)
|
4
5
|
|
5
6
|
A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
|
6
7
|
|
8
|
+
Please feel free to ask [@ts_3156](https://twitter.com/ts_3156) if you have any questions.
|
9
|
+
|
7
10
|
|
8
11
|
## Twitter Search API vs. twitterscraper-ruby
|
9
12
|
|
10
|
-
|
13
|
+
#### Twitter Search API
|
11
14
|
|
12
15
|
- The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
|
13
16
|
- The time window: the past 7 days
|
14
17
|
|
15
|
-
|
18
|
+
#### twitterscraper-ruby
|
16
19
|
|
17
20
|
- The number of tweets: Unlimited
|
18
21
|
- The time window: from 2006-3-21 to today
|
@@ -29,45 +32,92 @@ $ gem install twitterscraper-ruby
|
|
29
32
|
|
30
33
|
## Usage
|
31
34
|
|
32
|
-
Command-line interface:
|
35
|
+
#### Command-line interface:
|
36
|
+
|
37
|
+
Returns a collection of relevant tweets matching a specified query.
|
33
38
|
|
34
39
|
```shell script
|
35
|
-
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
36
|
-
--limit 100 --threads 10 --
|
40
|
+
$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
41
|
+
--limit 100 --threads 10 --output tweets.json
|
37
42
|
```
|
38
43
|
|
39
|
-
|
44
|
+
Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
|
45
|
+
|
46
|
+
```shell script
|
47
|
+
$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
|
48
|
+
```
|
49
|
+
|
50
|
+
#### From Within Ruby:
|
40
51
|
|
41
52
|
```ruby
|
42
53
|
require 'twitterscraper'
|
54
|
+
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
55
|
+
```
|
43
56
|
|
44
|
-
|
45
|
-
start_date: '2020-06-01',
|
46
|
-
end_date: '2020-06-30',
|
47
|
-
lang: 'ja',
|
48
|
-
limit: 100,
|
49
|
-
threads: 10,
|
50
|
-
proxy: true
|
51
|
-
}
|
57
|
+
Returns a collection of relevant tweets matching a specified query.
|
52
58
|
|
53
|
-
|
54
|
-
tweets = client.
|
59
|
+
```ruby
|
60
|
+
tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
|
61
|
+
```
|
62
|
+
|
63
|
+
Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
tweets = client.user_timeline(SCREEN_NAME, limit: 100)
|
67
|
+
```
|
55
68
|
|
69
|
+
|
70
|
+
## Examples
|
71
|
+
|
72
|
+
```shell script
|
73
|
+
$ twitterscraper --query twitter --limit 1000
|
74
|
+
$ cat tweets.json | jq . | less
|
75
|
+
```
|
76
|
+
|
77
|
+
|
78
|
+
## Attributes
|
79
|
+
|
80
|
+
### Tweet
|
81
|
+
|
82
|
+
```ruby
|
56
83
|
tweets.each do |tweet|
|
57
84
|
puts tweet.tweet_id
|
58
85
|
puts tweet.text
|
59
86
|
puts tweet.tweet_url
|
60
87
|
puts tweet.created_at
|
61
88
|
|
89
|
+
attr_names = tweet.attrs.keys
|
62
90
|
hash = tweet.attrs
|
63
|
-
|
91
|
+
json = tweet.to_json
|
64
92
|
end
|
65
93
|
```
|
66
94
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
95
|
+
```json
|
96
|
+
[
|
97
|
+
{
|
98
|
+
"screen_name": "@name",
|
99
|
+
"name": "Name",
|
100
|
+
"user_id": 12340000,
|
101
|
+
"tweet_id": 1234000000000000,
|
102
|
+
"text": "Thanks Twitter!",
|
103
|
+
"links": [],
|
104
|
+
"hashtags": [],
|
105
|
+
"image_urls": [],
|
106
|
+
"video_url": null,
|
107
|
+
"has_media": null,
|
108
|
+
"likes": 10,
|
109
|
+
"retweets": 20,
|
110
|
+
"replies": 0,
|
111
|
+
"is_replied": false,
|
112
|
+
"is_reply_to": false,
|
113
|
+
"parent_tweet_id": null,
|
114
|
+
"reply_to_users": [],
|
115
|
+
"tweet_url": "https://twitter.com/name/status/1234000000000000",
|
116
|
+
"timestamp": 1594793000,
|
117
|
+
"created_at": "2020-07-15 00:00:00 +0000"
|
118
|
+
}
|
119
|
+
]
|
120
|
+
```
|
71
121
|
|
72
122
|
- screen_name
|
73
123
|
- name
|
@@ -110,43 +160,24 @@ end
|
|
110
160
|
Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
|
111
161
|
|
112
162
|
|
113
|
-
## Examples
|
114
|
-
|
115
|
-
```shell script
|
116
|
-
$ twitterscraper --query twitter --limit 1000
|
117
|
-
$ cat tweets.json | jq . | less
|
118
|
-
```
|
119
|
-
|
120
|
-
```json
|
121
|
-
[
|
122
|
-
{
|
123
|
-
"screen_name": "@screenname",
|
124
|
-
"name": "name",
|
125
|
-
"user_id": 1194529546483000000,
|
126
|
-
"tweet_id": 1282659891992000000,
|
127
|
-
"tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
|
128
|
-
"created_at": "2020-07-13 12:00:00 +0000",
|
129
|
-
"text": "Thanks Twitter!"
|
130
|
-
}
|
131
|
-
]
|
132
|
-
```
|
133
|
-
|
134
163
|
## CLI Options
|
135
164
|
|
136
|
-
| Option | Description |
|
137
|
-
| ------------- | ------------- | ------------- |
|
138
|
-
|
|
139
|
-
| `--
|
140
|
-
| `--
|
141
|
-
| `--
|
142
|
-
| `--
|
143
|
-
| `--
|
144
|
-
| `--
|
145
|
-
| `--
|
146
|
-
| `--
|
147
|
-
| `--
|
148
|
-
| `--
|
149
|
-
| `--
|
165
|
+
| Option | Type | Description | Value |
|
166
|
+
| ------------- | ------------- | ------------- | ------------- |
|
167
|
+
| `--help` | string | This option displays a summary of twitterscraper. | |
|
168
|
+
| `--type` | string | Specify a search type. | search(default) or user |
|
169
|
+
| `--query` | string | Specify a keyword used during the search. | |
|
170
|
+
| `--start_date` | string | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
171
|
+
| `--end_date` | string | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
172
|
+
| `--lang` | string | Retrieve tweets written in a specific language. | |
|
173
|
+
| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
174
|
+
| `--order` | string | Specify the sort order of the results. | desc(default) or asc |
|
175
|
+
| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
176
|
+
| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
|
177
|
+
| `--cache` | boolean | Enable caching. | true(default) or false |
|
178
|
+
| `--format` | string | The format of the output. | json(default) or html |
|
179
|
+
| `--output` | string | The name of the output file. | tweets.json |
|
180
|
+
| `--verbose` | | Print debug messages. | |
|
150
181
|
|
151
182
|
|
152
183
|
## Contributing
|
data/lib/twitterscraper.rb
CHANGED
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
|
|
4
4
|
module Twitterscraper
|
5
5
|
class Cache
|
6
6
|
def initialize()
|
7
|
-
@ttl =
|
7
|
+
@ttl = 86400 # 1 day
|
8
8
|
@dir = 'cache'
|
9
9
|
Dir.mkdir(@dir) unless File.exist?(@dir)
|
10
10
|
end
|
@@ -25,6 +25,12 @@ module Twitterscraper
|
|
25
25
|
File.write(file, entry.to_json)
|
26
26
|
end
|
27
27
|
|
28
|
+
def delete(key)
|
29
|
+
key = cache_key(key)
|
30
|
+
file = File.join(@dir, key)
|
31
|
+
File.delete(file) if File.exist?(file)
|
32
|
+
end
|
33
|
+
|
28
34
|
def fetch(key, &block)
|
29
35
|
if (value = read(key))
|
30
36
|
value
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -16,25 +16,27 @@ module Twitterscraper
|
|
16
16
|
print_version || return if print_version?
|
17
17
|
|
18
18
|
query_options = {
|
19
|
+
type: options['type'],
|
19
20
|
start_date: options['start_date'],
|
20
21
|
end_date: options['end_date'],
|
21
22
|
lang: options['lang'],
|
22
23
|
limit: options['limit'],
|
23
24
|
daily_limit: options['daily_limit'],
|
25
|
+
order: options['order'],
|
24
26
|
threads: options['threads'],
|
25
27
|
}
|
26
28
|
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
27
29
|
tweets = client.query_tweets(options['query'], query_options)
|
28
|
-
export(tweets) unless tweets.empty?
|
30
|
+
export(options['query'], tweets) unless tweets.empty?
|
29
31
|
end
|
30
32
|
|
31
|
-
def export(tweets)
|
33
|
+
def export(name, tweets)
|
32
34
|
write_json = lambda { File.write(options['output'], generate_json(tweets)) }
|
33
35
|
|
34
36
|
if options['format'] == 'json'
|
35
37
|
write_json.call
|
36
38
|
elsif options['format'] == 'html'
|
37
|
-
File.write('
|
39
|
+
File.write(options['output'], Template.new.tweets_embedded_html(name, tweets, options))
|
38
40
|
else
|
39
41
|
write_json.call
|
40
42
|
end
|
@@ -58,12 +60,14 @@ module Twitterscraper
|
|
58
60
|
'help',
|
59
61
|
'v',
|
60
62
|
'version',
|
63
|
+
'type:',
|
61
64
|
'query:',
|
62
65
|
'start_date:',
|
63
66
|
'end_date:',
|
64
67
|
'lang:',
|
65
68
|
'limit:',
|
66
69
|
'daily_limit:',
|
70
|
+
'order:',
|
67
71
|
'threads:',
|
68
72
|
'output:',
|
69
73
|
'format:',
|
@@ -73,12 +77,14 @@ module Twitterscraper
|
|
73
77
|
'verbose',
|
74
78
|
)
|
75
79
|
|
80
|
+
options['type'] ||= 'search'
|
76
81
|
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
77
82
|
options['lang'] ||= ''
|
78
83
|
options['limit'] = (options['limit'] || 100).to_i
|
79
84
|
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
80
85
|
options['threads'] = (options['threads'] || 2).to_i
|
81
86
|
options['format'] ||= 'json'
|
87
|
+
options['order'] ||= 'desc'
|
82
88
|
options['output'] ||= "tweets.#{options['format']}"
|
83
89
|
|
84
90
|
options['cache'] = options['cache'] != 'false'
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -22,23 +22,24 @@ module Twitterscraper
|
|
22
22
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
23
23
|
'default&include_available_features=1&include_entities=1&' +
|
24
24
|
'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
|
25
|
-
INIT_URL_USER = 'https://twitter.com/
|
26
|
-
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/
|
25
|
+
INIT_URL_USER = 'https://twitter.com/__USER__'
|
26
|
+
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
|
27
27
|
'include_available_features=1&include_entities=1&' +
|
28
|
-
'max_position=
|
29
|
-
|
30
|
-
def build_query_url(query, lang,
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
# end
|
38
|
-
if pos
|
39
|
-
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
28
|
+
'max_position=__POS__&reset_error_state=false'
|
29
|
+
|
30
|
+
def build_query_url(query, lang, type, pos)
|
31
|
+
if type.user?
|
32
|
+
if pos
|
33
|
+
RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
|
34
|
+
else
|
35
|
+
INIT_URL_USER.sub('__USER__', query)
|
36
|
+
end
|
40
37
|
else
|
41
|
-
|
38
|
+
if pos
|
39
|
+
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
40
|
+
else
|
41
|
+
INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
|
42
|
+
end
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
@@ -50,7 +51,7 @@ module Twitterscraper
|
|
50
51
|
end
|
51
52
|
Http.get(url, headers, proxy, timeout)
|
52
53
|
rescue => e
|
53
|
-
logger.debug "
|
54
|
+
logger.debug "get_single_page: #{e.inspect}"
|
54
55
|
if (retries -= 1) > 0
|
55
56
|
logger.info "Retrying... (Attempts left: #{retries - 1})"
|
56
57
|
retry
|
@@ -68,17 +69,16 @@ module Twitterscraper
|
|
68
69
|
else
|
69
70
|
json_resp = JSON.parse(text)
|
70
71
|
items_html = json_resp['items_html'] || ''
|
71
|
-
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
72
72
|
end
|
73
73
|
|
74
74
|
[items_html, json_resp]
|
75
75
|
end
|
76
76
|
|
77
|
-
def query_single_page(query, lang,
|
77
|
+
def query_single_page(query, lang, type, pos, headers: [], proxies: [])
|
78
78
|
logger.info "Querying #{query}"
|
79
79
|
query = ERB::Util.url_encode(query)
|
80
80
|
|
81
|
-
url = build_query_url(query, lang,
|
81
|
+
url = build_query_url(query, lang, type, pos)
|
82
82
|
http_request = lambda do
|
83
83
|
logger.debug "Scraping tweets from #{url}"
|
84
84
|
get_single_page(url, headers, proxies)
|
@@ -99,6 +99,12 @@ module Twitterscraper
|
|
99
99
|
|
100
100
|
html, json_resp = parse_single_page(response, pos.nil?)
|
101
101
|
|
102
|
+
if json_resp && json_resp['message']
|
103
|
+
logger.warn json_resp['message'] # Sorry, you are rate limited.
|
104
|
+
@stop_requested = true
|
105
|
+
Cache.new.delete(url) if cache_enabled?
|
106
|
+
end
|
107
|
+
|
102
108
|
tweets = Tweet.from_html(html)
|
103
109
|
|
104
110
|
if tweets.empty?
|
@@ -107,8 +113,8 @@ module Twitterscraper
|
|
107
113
|
|
108
114
|
if json_resp
|
109
115
|
[tweets, json_resp['min_position']]
|
110
|
-
elsif
|
111
|
-
|
116
|
+
elsif type.user?
|
117
|
+
[tweets, tweets[-1].tweet_id]
|
112
118
|
else
|
113
119
|
[tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
|
114
120
|
end
|
@@ -116,7 +122,7 @@ module Twitterscraper
|
|
116
122
|
|
117
123
|
OLDEST_DATE = Date.parse('2006-03-21')
|
118
124
|
|
119
|
-
def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
|
125
|
+
def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
|
120
126
|
query = queries[0]
|
121
127
|
if query.nil? || query == ''
|
122
128
|
raise Error.new('Please specify a search query.')
|
@@ -139,19 +145,27 @@ module Twitterscraper
|
|
139
145
|
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
140
146
|
end
|
141
147
|
end
|
142
|
-
|
143
|
-
if end_date
|
144
|
-
today = Date.today
|
145
|
-
if end_date > Date.today
|
146
|
-
raise Error.new(":end_date must be less than or equal to today(#{today})")
|
147
|
-
end
|
148
|
-
end
|
149
148
|
end
|
150
149
|
|
151
150
|
def build_queries(query, start_date, end_date)
|
152
151
|
if start_date && end_date
|
153
|
-
date_range = start_date.upto(end_date - 1)
|
154
|
-
date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
152
|
+
# date_range = start_date.upto(end_date - 1)
|
153
|
+
# date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
154
|
+
|
155
|
+
queries = []
|
156
|
+
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
157
|
+
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
158
|
+
|
159
|
+
while true
|
160
|
+
if time < Time.now.utc
|
161
|
+
queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
|
162
|
+
end
|
163
|
+
time += 3600
|
164
|
+
break if time >= end_time
|
165
|
+
end
|
166
|
+
|
167
|
+
queries
|
168
|
+
|
155
169
|
elsif start_date
|
156
170
|
[query + " since:#{start_date}"]
|
157
171
|
elsif end_date
|
@@ -161,12 +175,12 @@ module Twitterscraper
|
|
161
175
|
end
|
162
176
|
end
|
163
177
|
|
164
|
-
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
178
|
+
def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
165
179
|
pos = nil
|
166
180
|
daily_tweets = []
|
167
181
|
|
168
182
|
while true
|
169
|
-
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
183
|
+
new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
|
170
184
|
unless new_tweets.empty?
|
171
185
|
daily_tweets.concat(new_tweets)
|
172
186
|
daily_tweets.uniq! { |t| t.tweet_id }
|
@@ -195,12 +209,12 @@ module Twitterscraper
|
|
195
209
|
@stop_requested
|
196
210
|
end
|
197
211
|
|
198
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang:
|
212
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
|
199
213
|
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
200
214
|
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
201
215
|
queries = build_queries(query, start_date, end_date)
|
216
|
+
type = Type.new(type)
|
202
217
|
if threads > queries.size
|
203
|
-
logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
|
204
218
|
threads = queries.size
|
205
219
|
end
|
206
220
|
if proxy_enabled?
|
@@ -212,8 +226,7 @@ module Twitterscraper
|
|
212
226
|
end
|
213
227
|
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
214
228
|
|
215
|
-
|
216
|
-
validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
229
|
+
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
217
230
|
|
218
231
|
logger.info "The number of threads #{threads}"
|
219
232
|
|
@@ -229,17 +242,25 @@ module Twitterscraper
|
|
229
242
|
logger.debug "Set 'Thread.abort_on_exception' to true"
|
230
243
|
|
231
244
|
Parallel.each(queries, in_threads: threads) do |query|
|
232
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
245
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
233
246
|
raise Parallel::Break if stop_requested?
|
234
247
|
end
|
235
248
|
else
|
236
249
|
queries.each do |query|
|
237
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
250
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
238
251
|
break if stop_requested?
|
239
252
|
end
|
240
253
|
end
|
241
254
|
|
242
|
-
@all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
|
255
|
+
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
256
|
+
end
|
257
|
+
|
258
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
|
259
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
260
|
+
end
|
261
|
+
|
262
|
+
def user_timeline(screen_name, limit: 100, order: 'desc')
|
263
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
243
264
|
end
|
244
265
|
end
|
245
266
|
end
|
@@ -1,48 +1,30 @@
|
|
1
1
|
module Twitterscraper
|
2
|
-
|
3
|
-
|
2
|
+
class Template
|
3
|
+
def tweets_embedded_html(name, tweets, options)
|
4
|
+
path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
|
5
|
+
template = ERB.new(File.read(path))
|
4
6
|
|
5
|
-
|
6
|
-
|
7
|
-
|
7
|
+
template.result_with_hash(
|
8
|
+
chart_name: name,
|
9
|
+
chart_data: chart_data(tweets).to_json,
|
10
|
+
first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
|
11
|
+
last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
|
12
|
+
tweets_size: tweets.size,
|
13
|
+
tweets: tweets.take(50)
|
14
|
+
)
|
8
15
|
end
|
9
16
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
17
|
+
def chart_data(tweets)
|
18
|
+
data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
|
19
|
+
t = tweet.created_at
|
20
|
+
min = (t.min.to_f / 5).floor * 5
|
21
|
+
time = Time.new(t.year, t.month, t.day, t.hour, min, 0, '+00:00')
|
22
|
+
memo[time.to_i] += 1
|
23
|
+
end
|
15
24
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
.twitter-tweet {
|
21
|
-
margin: 30px auto 0 auto !important;
|
22
|
-
}
|
23
|
-
</style>
|
24
|
-
<script>
|
25
|
-
window.twttr = (function(d, s, id) {
|
26
|
-
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
27
|
-
if (d.getElementById(id)) return t;
|
28
|
-
js = d.createElement(s);
|
29
|
-
js.id = id;
|
30
|
-
js.src = "https://platform.twitter.com/widgets.js";
|
31
|
-
fjs.parentNode.insertBefore(js, fjs);
|
32
|
-
|
33
|
-
t._e = [];
|
34
|
-
t.ready = function(f) {
|
35
|
-
t._e.push(f);
|
36
|
-
};
|
37
|
-
|
38
|
-
return t;
|
39
|
-
}(document, "script", "twitter-wjs"));
|
40
|
-
</script>
|
41
|
-
</head>
|
42
|
-
<body>
|
43
|
-
__TWEETS__
|
44
|
-
</body>
|
45
|
-
</html>
|
46
|
-
HTML
|
25
|
+
data.sort_by { |k, v| k }.map do |timestamp, count|
|
26
|
+
[timestamp * 1000, count]
|
27
|
+
end
|
28
|
+
end
|
47
29
|
end
|
48
30
|
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<script>
|
4
|
+
window.twttr = (function (d, s, id) {
|
5
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
6
|
+
if (d.getElementById(id)) return t;
|
7
|
+
js = d.createElement(s);
|
8
|
+
js.id = id;
|
9
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
10
|
+
fjs.parentNode.insertBefore(js, fjs);
|
11
|
+
|
12
|
+
t._e = [];
|
13
|
+
t.ready = function (f) {
|
14
|
+
t._e.push(f);
|
15
|
+
};
|
16
|
+
|
17
|
+
return t;
|
18
|
+
}(document, "script", "twitter-wjs"));
|
19
|
+
</script>
|
20
|
+
|
21
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
|
22
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
|
23
|
+
<script src="https://code.highcharts.com/stock/highstock.js"></script>
|
24
|
+
<script>
|
25
|
+
function drawChart() {
|
26
|
+
Highcharts.setOptions({
|
27
|
+
time: {
|
28
|
+
timezone: moment.tz.guess()
|
29
|
+
}
|
30
|
+
});
|
31
|
+
|
32
|
+
Highcharts.stockChart('chart', {
|
33
|
+
title: {
|
34
|
+
text: '<%= tweets_size %> tweets of <%= chart_name %>'
|
35
|
+
},
|
36
|
+
subtitle: {
|
37
|
+
text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
|
38
|
+
},
|
39
|
+
series: [{
|
40
|
+
data: <%= chart_data %>
|
41
|
+
}],
|
42
|
+
rangeSelector: {enabled: false},
|
43
|
+
scrollbar: {enabled: false},
|
44
|
+
navigator: {enabled: false},
|
45
|
+
exporting: {enabled: false},
|
46
|
+
credits: {enabled: false}
|
47
|
+
});
|
48
|
+
}
|
49
|
+
|
50
|
+
document.addEventListener("DOMContentLoaded", function () {
|
51
|
+
drawChart();
|
52
|
+
});
|
53
|
+
</script>
|
54
|
+
|
55
|
+
<style type=text/css>
|
56
|
+
.tweets-container {
|
57
|
+
max-width: 550px;
|
58
|
+
margin: 0 auto 0 auto;
|
59
|
+
}
|
60
|
+
|
61
|
+
.twitter-tweet {
|
62
|
+
margin: 15px 0 15px 0 !important;
|
63
|
+
}
|
64
|
+
</style>
|
65
|
+
</head>
|
66
|
+
<body>
|
67
|
+
<div id="chart"></div>
|
68
|
+
|
69
|
+
<div class="tweets-container">
|
70
|
+
<% tweets.each do |tweet| %>
|
71
|
+
<blockquote class="twitter-tweet">
|
72
|
+
<a href="<%= tweet.tweet_url %>"></a>
|
73
|
+
</blockquote>
|
74
|
+
<% end %>
|
75
|
+
|
76
|
+
<% if tweets_size > tweets.size %>
|
77
|
+
<div>and more!</div>
|
78
|
+
<% end %>
|
79
|
+
</div>
|
80
|
+
|
81
|
+
</body>
|
82
|
+
</html>
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -46,6 +46,7 @@ executables:
|
|
46
46
|
extensions: []
|
47
47
|
extra_rdoc_files: []
|
48
48
|
files:
|
49
|
+
- ".circleci/config.yml"
|
49
50
|
- ".gitignore"
|
50
51
|
- ".irbrc"
|
51
52
|
- ".rspec"
|
@@ -71,7 +72,9 @@ files:
|
|
71
72
|
- lib/twitterscraper/proxy.rb
|
72
73
|
- lib/twitterscraper/query.rb
|
73
74
|
- lib/twitterscraper/template.rb
|
75
|
+
- lib/twitterscraper/template/tweets.html.erb
|
74
76
|
- lib/twitterscraper/tweet.rb
|
77
|
+
- lib/twitterscraper/type.rb
|
75
78
|
- lib/version.rb
|
76
79
|
- twitterscraper-ruby.gemspec
|
77
80
|
homepage: https://github.com/ts-3156/twitterscraper-ruby
|