twitterscraper-ruby 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c2429cf6172b5f19caede64ac35f5c796a7c8a67e76fff8dd2f08677fb15406b
4
- data.tar.gz: 0f32ca6b559a18c4e3aac3205f6503149e372d4d7d1976b1e83db26036d9ff17
3
+ metadata.gz: f4382801b03a5384095aad6a955caea438787fa2eed96e3e001237df368925a2
4
+ data.tar.gz: 6722b4edce7242b3006e5c097dd78847f36e2da7edea009e2d7b89b09f5b25ff
5
5
  SHA512:
6
- metadata.gz: a36ce6c91a363b64b36deeb3abbaaaebb725f3449f280b70be92532497a94dc5915ba449926acfacfc0d852d52471d258d41140a8891e64b6040bf262d0c347f
7
- data.tar.gz: a737c7db151190a1493b1a2a92bea304cfcf7512b2ee03fc13c6f25794f5dc727fe548e52cb39eccc2a63261fee0d58fc005920a0e7cd7650d20600e184d79cb
6
+ metadata.gz: 4ca72a0bbce553c38061e0362f755a5e82b47a5288108508410c19a7eef9a2514b58682e88ed1bf89654d5b89c84c41edd8a5fa34fd7d1e5fbf92b267402884a
7
+ data.tar.gz: 8853b015cb37180d6814710d971a757d08aa4ddd4579af4131e204e34bb10c80ef3139c082f17be92303d9efc2e3f8eb4ba0d15bdf4f264fb4fba0cf87ed42d7
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
-
9
+ /cache
10
10
  /.idea
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.10.0)
4
+ twitterscraper-ruby (0.11.0)
5
5
  nokogiri
6
6
  parallel
7
7
 
data/README.md CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
33
33
 
34
34
  ```shell script
35
35
  $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
36
- --limit 100 --threads 10 --proxy --output output.json
36
+ --limit 100 --threads 10 --proxy --cache --output output.json
37
37
  ```
38
38
 
39
39
  From Within Ruby:
@@ -143,8 +143,10 @@ $ cat tweets.json | jq . | less
143
143
  | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
144
144
  | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
145
145
  | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
146
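+ | `--cache` | Cache each fetched search page on disk (in the `cache` directory, reused for up to 1 hour) instead of requesting it again. | false |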
146
147
  | `--format` | The format of the output. | json |
147
148
  | `--output` | The name of the output file. | tweets.json |
149
+ | `--verbose` | Print debug messages. | false |
148
150
 
149
151
 
150
152
  ## Contributing
@@ -7,7 +7,7 @@ begin
7
7
  cli.parse
8
8
  cli.run
9
9
  rescue => e
10
- STDERR.puts e.message
10
+ STDERR.puts e.inspect
11
11
  STDERR.puts e.backtrace.join("\n")
12
12
  exit 1
13
13
  end
@@ -2,6 +2,7 @@ require 'twitterscraper/logger'
2
2
  require 'twitterscraper/proxy'
3
3
  require 'twitterscraper/http'
4
4
  require 'twitterscraper/lang'
5
+ require 'twitterscraper/cache'
5
6
  require 'twitterscraper/query'
6
7
  require 'twitterscraper/client'
7
8
  require 'twitterscraper/tweet'
@@ -0,0 +1,69 @@
1
+ require 'base64'
2
+ require 'digest/md5'
3
+
4
+ module Twitterscraper
5
+ class Cache
6
+ def initialize()
7
+ @ttl = 3600 # 1 hour
8
+ @dir = 'cache'
9
+ Dir.mkdir(@dir) unless File.exist?(@dir)
10
+ end
11
+
12
+ def read(key)
13
+ key = cache_key(key)
14
+ file = File.join(@dir, key)
15
+ entry = Entry.from_json(File.read(file))
16
+ entry.value if entry.time > Time.now - @ttl
17
+ rescue Errno::ENOENT => e
18
+ nil
19
+ end
20
+
21
+ def write(key, value)
22
+ key = cache_key(key)
23
+ entry = Entry.new(key, value, Time.now)
24
+ file = File.join(@dir, key)
25
+ File.write(file, entry.to_json)
26
+ end
27
+
28
+ def fetch(key, &block)
29
+ if (value = read(key))
30
+ value
31
+ else
32
+ yield.tap { |v| write(key, v) }
33
+ end
34
+ end
35
+
36
+ def cache_key(key)
37
+ value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
38
+ value = Digest::MD5.hexdigest(value) if value.length >= 100
39
+ value
40
+ end
41
+
42
+ class Entry < Hash
43
+ attr_reader :key, :value, :time
44
+
45
+ def initialize(key, value, time)
46
+ @key = key
47
+ @value = value
48
+ @time = time
49
+ end
50
+
51
+ def attrs
52
+ {key: @key, value: @value, time: @time}
53
+ end
54
+
55
+ def to_json
56
+ hash = attrs
57
+ hash[:value] = Base64.encode64(hash[:value])
58
+ hash.to_json
59
+ end
60
+
61
+ class << self
62
+ def from_json(text)
63
+ json = JSON.parse(text)
64
+ new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
@@ -23,7 +23,7 @@ module Twitterscraper
23
23
  threads: options['threads'],
24
24
  proxy: options['proxy']
25
25
  }
26
- client = Twitterscraper::Client.new
26
+ client = Twitterscraper::Client.new(cache: options['cache'])
27
27
  tweets = client.query_tweets(options['query'], query_options)
28
28
  export(tweets) unless tweets.empty?
29
29
  end
@@ -66,6 +66,7 @@ module Twitterscraper
66
66
  'threads:',
67
67
  'output:',
68
68
  'format:',
69
+ 'cache',
69
70
  'proxy',
70
71
  'pretty',
71
72
  'verbose',
@@ -1,5 +1,13 @@
1
1
  module Twitterscraper
2
2
  class Client
3
3
  include Query
4
+
5
+ def initialize(cache:)
6
+ @cache = cache
7
+ end
8
+
9
+ def cache_enabled?
10
+ @cache
11
+ end
4
12
  end
5
13
  end
@@ -75,9 +75,22 @@ module Twitterscraper
75
75
  query = ERB::Util.url_encode(query)
76
76
 
77
77
  url = build_query_url(query, lang, pos, from_user)
78
- logger.debug("Scraping tweets from #{url}")
78
+ http_request = lambda do
79
+ logger.debug("Scraping tweets from #{url}")
80
+ get_single_page(url, headers, proxies)
81
+ end
79
82
 
80
- response = get_single_page(url, headers, proxies)
83
+ if cache_enabled?
84
+ client = Cache.new
85
+ if (response = client.read(url))
86
+ logger.debug('Fetching tweets from cache')
87
+ else
88
+ response = http_request.call
89
+ client.write(url, response)
90
+ end
91
+ else
92
+ response = http_request.call
93
+ end
81
94
  return [], nil if response.nil?
82
95
 
83
96
  html, json_resp = parse_single_page(response, pos.nil?)
@@ -43,6 +43,14 @@ module Twitterscraper
43
43
  end
44
44
 
45
45
  class << self
46
+ def from_json(text)
47
+ json = JSON.parse(text)
48
+ json.map do |tweet|
49
+ tweet['created_at'] = Time.parse(tweet['created_at'])
50
+ new(tweet)
51
+ end
52
+ end
53
+
46
54
  def from_html(text)
47
55
  html = Nokogiri::HTML(text)
48
56
  from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = '0.10.0'
2
+ VERSION = '0.11.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-13 00:00:00.000000000 Z
11
+ date: 2020-07-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -61,6 +61,7 @@ files:
61
61
  - bin/twitterscraper
62
62
  - lib/twitterscraper-ruby.rb
63
63
  - lib/twitterscraper.rb
64
+ - lib/twitterscraper/cache.rb
64
65
  - lib/twitterscraper/cli.rb
65
66
  - lib/twitterscraper/client.rb
66
67
  - lib/twitterscraper/http.rb