twitterscraper-ruby 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +3 -1
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +2 -1
- data/lib/twitterscraper/client.rb +8 -0
- data/lib/twitterscraper/query.rb +15 -2
- data/lib/twitterscraper/tweet.rb +8 -0
- data/lib/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4382801b03a5384095aad6a955caea438787fa2eed96e3e001237df368925a2
|
4
|
+
data.tar.gz: 6722b4edce7242b3006e5c097dd78847f36e2da7edea009e2d7b89b09f5b25ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ca72a0bbce553c38061e0362f755a5e82b47a5288108508410c19a7eef9a2514b58682e88ed1bf89654d5b89c84c41edd8a5fa34fd7d1e5fbf92b267402884a
|
7
|
+
data.tar.gz: 8853b015cb37180d6814710d971a757d08aa4ddd4579af4131e204e34bb10c80ef3139c082f17be92303d9efc2e3f8eb4ba0d15bdf4f264fb4fba0cf87ed42d7
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
|
|
33
33
|
|
34
34
|
```shell script
|
35
35
|
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
36
|
-
--limit 100 --threads 10 --proxy --output output.json
|
36
|
+
--limit 100 --threads 10 --proxy --cache --output output.json
|
37
37
|
```
|
38
38
|
|
39
39
|
From Within Ruby:
|
@@ -143,8 +143,10 @@ $ cat tweets.json | jq . | less
|
|
143
143
|
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
144
144
|
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
145
145
|
| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
|
146
|
+
| `--cache` | Enable caching. | false |
|
146
147
|
| `--format` | The format of the output. | json |
|
147
148
|
| `--output` | The name of the output file. | tweets.json |
|
149
|
+
| `--verbose` | Print debug messages. | false |
|
148
150
|
|
149
151
|
|
150
152
|
## Contributing
|
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'base64'
require 'digest/md5'
require 'json' # needed for JSON.parse / Hash#to_json used below
require 'time' # needed for Time.parse used below

module Twitterscraper
  # File-backed response cache with a 1-hour TTL.
  #
  # Values are stored one file per key under ./cache. The payload is
  # Base64-encoded inside a small JSON envelope ({key, value, time}) so
  # that arbitrary strings (e.g. scraped HTML) survive the round trip.
  class Cache
    def initialize
      @ttl = 3600 # seconds (1 hour)
      @dir = 'cache'
      Dir.mkdir(@dir) unless Dir.exist?(@dir)
    end

    # Returns the cached value for +key+, or nil when the entry is
    # missing or older than the TTL.
    def read(key)
      key = cache_key(key)
      file = File.join(@dir, key)
      entry = Entry.from_json(File.read(file))
      entry.value if entry.time > Time.now - @ttl
    rescue Errno::ENOENT
      nil # cache miss
    end

    # Stores +value+ (a String) under +key+, overwriting any previous entry.
    def write(key, value)
      key = cache_key(key)
      entry = Entry.new(key, value, Time.now)
      file = File.join(@dir, key)
      File.write(file, entry.to_json)
    end

    # Returns the cached value for +key+ if fresh, otherwise evaluates
    # the block, caches its result, and returns it.
    def fetch(key, &block)
      if (value = read(key))
        value
      else
        yield.tap { |v| write(key, v) }
      end
    end

    # Maps an arbitrary key (typically a URL) to a filesystem-safe name.
    # URL-unsafe characters are percent-escaped; long keys are replaced
    # by their MD5 hex digest to stay within filename length limits.
    def cache_key(key)
      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
      value = Digest::MD5.hexdigest(value) if value.length >= 100
      value
    end

    # A single cache record. NOTE(review): inheriting from Hash appears
    # unused (state lives in ivars and #initialize never calls super);
    # kept as-is for backward compatibility with any is_a?(Hash) checks.
    class Entry < Hash
      attr_reader :key, :value, :time

      def initialize(key, value, time)
        @key = key
        @value = value
        @time = time
      end

      def attrs
        {key: @key, value: @value, time: @time}
      end

      # Serializes to JSON, Base64-encoding the value so arbitrary
      # payloads round-trip safely through the JSON envelope.
      def to_json
        hash = attrs
        hash[:value] = Base64.encode64(hash[:value])
        hash.to_json
      end

      class << self
        # Rebuilds an Entry from the JSON produced by #to_json.
        def from_json(text)
          json = JSON.parse(text)
          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
        end
      end
    end
  end
end
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -23,7 +23,7 @@ module Twitterscraper
|
|
23
23
|
threads: options['threads'],
|
24
24
|
proxy: options['proxy']
|
25
25
|
}
|
26
|
-
client = Twitterscraper::Client.new
|
26
|
+
client = Twitterscraper::Client.new(cache: options['cache'])
|
27
27
|
tweets = client.query_tweets(options['query'], query_options)
|
28
28
|
export(tweets) unless tweets.empty?
|
29
29
|
end
|
@@ -66,6 +66,7 @@ module Twitterscraper
|
|
66
66
|
'threads:',
|
67
67
|
'output:',
|
68
68
|
'format:',
|
69
|
+
'cache',
|
69
70
|
'proxy',
|
70
71
|
'pretty',
|
71
72
|
'verbose',
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -75,9 +75,22 @@ module Twitterscraper
|
|
75
75
|
query = ERB::Util.url_encode(query)
|
76
76
|
|
77
77
|
url = build_query_url(query, lang, pos, from_user)
|
78
|
-
|
78
|
+
http_request = lambda do
|
79
|
+
logger.debug("Scraping tweets from #{url}")
|
80
|
+
get_single_page(url, headers, proxies)
|
81
|
+
end
|
79
82
|
|
80
|
-
|
83
|
+
if cache_enabled?
|
84
|
+
client = Cache.new
|
85
|
+
if (response = client.read(url))
|
86
|
+
logger.debug('Fetching tweets from cache')
|
87
|
+
else
|
88
|
+
response = http_request.call
|
89
|
+
client.write(url, response)
|
90
|
+
end
|
91
|
+
else
|
92
|
+
response = http_request.call
|
93
|
+
end
|
81
94
|
return [], nil if response.nil?
|
82
95
|
|
83
96
|
html, json_resp = parse_single_page(response, pos.nil?)
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
|
|
43
43
|
end
|
44
44
|
|
45
45
|
class << self
|
46
|
+
# Builds Tweet objects from a JSON array (the format produced when
# tweets are exported as JSON). Each element's 'created_at' string is
# converted back to a Time before the attributes are passed to new.
def from_json(text)
  JSON.parse(text).map do |attrs|
    attrs['created_at'] = Time.parse(attrs['created_at'])
    new(attrs)
  end
end
|
53
|
+
|
46
54
|
def from_html(text)
|
47
55
|
html = Nokogiri::HTML(text)
|
48
56
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -61,6 +61,7 @@ files:
|
|
61
61
|
- bin/twitterscraper
|
62
62
|
- lib/twitterscraper-ruby.rb
|
63
63
|
- lib/twitterscraper.rb
|
64
|
+
- lib/twitterscraper/cache.rb
|
64
65
|
- lib/twitterscraper/cli.rb
|
65
66
|
- lib/twitterscraper/client.rb
|
66
67
|
- lib/twitterscraper/http.rb
|