twitterscraper-ruby 0.10.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +3 -1
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +2 -1
- data/lib/twitterscraper/client.rb +8 -0
- data/lib/twitterscraper/query.rb +15 -2
- data/lib/twitterscraper/tweet.rb +8 -0
- data/lib/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4382801b03a5384095aad6a955caea438787fa2eed96e3e001237df368925a2
|
4
|
+
data.tar.gz: 6722b4edce7242b3006e5c097dd78847f36e2da7edea009e2d7b89b09f5b25ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ca72a0bbce553c38061e0362f755a5e82b47a5288108508410c19a7eef9a2514b58682e88ed1bf89654d5b89c84c41edd8a5fa34fd7d1e5fbf92b267402884a
|
7
|
+
data.tar.gz: 8853b015cb37180d6814710d971a757d08aa4ddd4579af4131e204e34bb10c80ef3139c082f17be92303d9efc2e3f8eb4ba0d15bdf4f264fb4fba0cf87ed42d7
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
|
|
33
33
|
|
34
34
|
```shell script
|
35
35
|
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
36
|
-
--limit 100 --threads 10 --proxy --output output.json
|
36
|
+
--limit 100 --threads 10 --proxy --cache --output output.json
|
37
37
|
```
|
38
38
|
|
39
39
|
From Within Ruby:
|
@@ -143,8 +143,10 @@ $ cat tweets.json | jq . | less
|
|
143
143
|
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
144
144
|
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
145
145
|
| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
|
146
|
+
| `--cache` | Enable caching. | false |
|
146
147
|
| `--format` | The format of the output. | json |
|
147
148
|
| `--output` | The name of the output file. | tweets.json |
|
149
|
+
| `--verbose` | Print debug messages. | false |
|
148
150
|
|
149
151
|
|
150
152
|
## Contributing
|
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
data/lib/twitterscraper/cache.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'digest/md5'
|
3
|
+
|
4
|
+
module Twitterscraper
|
5
|
+
class Cache
|
6
|
+
def initialize()
|
7
|
+
@ttl = 3600 # 1 hour
|
8
|
+
@dir = 'cache'
|
9
|
+
Dir.mkdir(@dir) unless File.exist?(@dir)
|
10
|
+
end
|
11
|
+
|
12
|
+
def read(key)
|
13
|
+
key = cache_key(key)
|
14
|
+
file = File.join(@dir, key)
|
15
|
+
entry = Entry.from_json(File.read(file))
|
16
|
+
entry.value if entry.time > Time.now - @ttl
|
17
|
+
rescue Errno::ENOENT => e
|
18
|
+
nil
|
19
|
+
end
|
20
|
+
|
21
|
+
def write(key, value)
|
22
|
+
key = cache_key(key)
|
23
|
+
entry = Entry.new(key, value, Time.now)
|
24
|
+
file = File.join(@dir, key)
|
25
|
+
File.write(file, entry.to_json)
|
26
|
+
end
|
27
|
+
|
28
|
+
def fetch(key, &block)
|
29
|
+
if (value = read(key))
|
30
|
+
value
|
31
|
+
else
|
32
|
+
yield.tap { |v| write(key, v) }
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def cache_key(key)
|
37
|
+
value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
|
38
|
+
value = Digest::MD5.hexdigest(value) if value.length >= 100
|
39
|
+
value
|
40
|
+
end
|
41
|
+
|
42
|
+
class Entry < Hash
|
43
|
+
attr_reader :key, :value, :time
|
44
|
+
|
45
|
+
def initialize(key, value, time)
|
46
|
+
@key = key
|
47
|
+
@value = value
|
48
|
+
@time = time
|
49
|
+
end
|
50
|
+
|
51
|
+
def attrs
|
52
|
+
{key: @key, value: @value, time: @time}
|
53
|
+
end
|
54
|
+
|
55
|
+
def to_json
|
56
|
+
hash = attrs
|
57
|
+
hash[:value] = Base64.encode64(hash[:value])
|
58
|
+
hash.to_json
|
59
|
+
end
|
60
|
+
|
61
|
+
class << self
|
62
|
+
def from_json(text)
|
63
|
+
json = JSON.parse(text)
|
64
|
+
new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -23,7 +23,7 @@ module Twitterscraper
|
|
23
23
|
threads: options['threads'],
|
24
24
|
proxy: options['proxy']
|
25
25
|
}
|
26
|
-
client = Twitterscraper::Client.new
|
26
|
+
client = Twitterscraper::Client.new(cache: options['cache'])
|
27
27
|
tweets = client.query_tweets(options['query'], query_options)
|
28
28
|
export(tweets) unless tweets.empty?
|
29
29
|
end
|
@@ -66,6 +66,7 @@ module Twitterscraper
|
|
66
66
|
'threads:',
|
67
67
|
'output:',
|
68
68
|
'format:',
|
69
|
+
'cache',
|
69
70
|
'proxy',
|
70
71
|
'pretty',
|
71
72
|
'verbose',
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -75,9 +75,22 @@ module Twitterscraper
|
|
75
75
|
query = ERB::Util.url_encode(query)
|
76
76
|
|
77
77
|
url = build_query_url(query, lang, pos, from_user)
|
78
|
-
|
78
|
+
http_request = lambda do
|
79
|
+
logger.debug("Scraping tweets from #{url}")
|
80
|
+
get_single_page(url, headers, proxies)
|
81
|
+
end
|
79
82
|
|
80
|
-
|
83
|
+
if cache_enabled?
|
84
|
+
client = Cache.new
|
85
|
+
if (response = client.read(url))
|
86
|
+
logger.debug('Fetching tweets from cache')
|
87
|
+
else
|
88
|
+
response = http_request.call
|
89
|
+
client.write(url, response)
|
90
|
+
end
|
91
|
+
else
|
92
|
+
response = http_request.call
|
93
|
+
end
|
81
94
|
return [], nil if response.nil?
|
82
95
|
|
83
96
|
html, json_resp = parse_single_page(response, pos.nil?)
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
|
|
43
43
|
end
|
44
44
|
|
45
45
|
class << self
|
46
|
+
def from_json(text)
|
47
|
+
json = JSON.parse(text)
|
48
|
+
json.map do |tweet|
|
49
|
+
tweet['created_at'] = Time.parse(tweet['created_at'])
|
50
|
+
new(tweet)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
46
54
|
def from_html(text)
|
47
55
|
html = Nokogiri::HTML(text)
|
48
56
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.0
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -61,6 +61,7 @@ files:
|
|
61
61
|
- bin/twitterscraper
|
62
62
|
- lib/twitterscraper-ruby.rb
|
63
63
|
- lib/twitterscraper.rb
|
64
|
+
- lib/twitterscraper/cache.rb
|
64
65
|
- lib/twitterscraper/cli.rb
|
65
66
|
- lib/twitterscraper/client.rb
|
66
67
|
- lib/twitterscraper/http.rb
|