twitterscraper-ruby 0.10.0 → 0.11.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c2429cf6172b5f19caede64ac35f5c796a7c8a67e76fff8dd2f08677fb15406b
-  data.tar.gz: 0f32ca6b559a18c4e3aac3205f6503149e372d4d7d1976b1e83db26036d9ff17
+  metadata.gz: f4382801b03a5384095aad6a955caea438787fa2eed96e3e001237df368925a2
+  data.tar.gz: 6722b4edce7242b3006e5c097dd78847f36e2da7edea009e2d7b89b09f5b25ff
 SHA512:
-  metadata.gz: a36ce6c91a363b64b36deeb3abbaaaebb725f3449f280b70be92532497a94dc5915ba449926acfacfc0d852d52471d258d41140a8891e64b6040bf262d0c347f
-  data.tar.gz: a737c7db151190a1493b1a2a92bea304cfcf7512b2ee03fc13c6f25794f5dc727fe548e52cb39eccc2a63261fee0d58fc005920a0e7cd7650d20600e184d79cb
+  metadata.gz: 4ca72a0bbce553c38061e0362f755a5e82b47a5288108508410c19a7eef9a2514b58682e88ed1bf89654d5b89c84c41edd8a5fa34fd7d1e5fbf92b267402884a
+  data.tar.gz: 8853b015cb37180d6814710d971a757d08aa4ddd4579af4131e204e34bb10c80ef3139c082f17be92303d9efc2e3f8eb4ba0d15bdf4f264fb4fba0cf87ed42d7
data/.gitignore CHANGED
@@ -6,5 +6,5 @@
 /pkg/
 /spec/reports/
 /tmp/
-
+/cache
 /.idea
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.10.0)
+    twitterscraper-ruby (0.11.0)
       nokogiri
       parallel
 
data/README.md CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
 
 ```shell script
 $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --proxy --output output.json
+    --limit 100 --threads 10 --proxy --cache --output output.json
 ```
 
 From Within Ruby:
@@ -143,8 +143,10 @@ $ cat tweets.json | jq . | less
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
 | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+| `--cache` | Enable caching. | false |
 | `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | false |
 
 
 ## Contributing
data/bin/twitterscraper CHANGED
@@ -7,7 +7,7 @@ begin
   cli.parse
   cli.run
 rescue => e
-  STDERR.puts e.message
+  STDERR.puts e.inspect
   STDERR.puts e.backtrace.join("\n")
   exit 1
 end
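For reference, the switch from `e.message` to `e.inspect` makes the exception class visible in the error output; a quick illustration (the error text is made up):

```ruby
begin
  raise ArgumentError, 'query is required'
rescue => e
  STDERR.puts e.message # prints: query is required
  STDERR.puts e.inspect # prints: #<ArgumentError: query is required>
end
```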
data/lib/twitterscraper.rb CHANGED
@@ -2,6 +2,7 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
data/lib/twitterscraper/cache.rb ADDED
@@ -0,0 +1,71 @@
+require 'base64'
+require 'digest/md5'
+require 'json' # needed for Hash#to_json / JSON.parse below
+require 'time' # needed for Time.parse below
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
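Since `cache.rb` is new in this release, here is a minimal sketch of the class used on its own; the key and value strings are hypothetical, and note that entries older than the hard-coded 1-hour TTL read back as `nil`:

```ruby
require 'twitterscraper'

cache = Twitterscraper::Cache.new # creates ./cache on first use

# write/read round-trip; the key is percent-escaped (or MD5-hashed when long)
cache.write('https://example.com/search?q=ruby', '<html>stub page</html>')
cache.read('https://example.com/search?q=ruby') # => "<html>stub page</html>"

# fetch yields the block only on a miss or expired entry, then stores the result
body = cache.fetch('https://example.com/other') { 'stub response body' }
```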
data/lib/twitterscraper/cli.rb CHANGED
@@ -23,7 +23,7 @@ module Twitterscraper
         threads: options['threads'],
         proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'])
      tweets = client.query_tweets(options['query'], query_options)
      export(tweets) unless tweets.empty?
    end
@@ -66,6 +66,7 @@ module Twitterscraper
       'threads:',
       'output:',
       'format:',
+      'cache',
       'proxy',
       'pretty',
       'verbose',
data/lib/twitterscraper/client.rb CHANGED
@@ -1,5 +1,13 @@
 module Twitterscraper
   class Client
     include Query
+
+    def initialize(cache:)
+      @cache = cache
+    end
+
+    def cache_enabled?
+      @cache
+    end
   end
 end
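Tying the `cli.rb` and `client.rb` hunks together, enabling the cache from Ruby rather than via `--cache` looks like this; a sketch, with the `limit:` option assumed from the README's option table:

```ruby
require 'twitterscraper'

# cache: true routes page fetches through the on-disk cache (see query.rb below)
client = Twitterscraper::Client.new(cache: true)
tweets = client.query_tweets('KEYWORD', limit: 100)
```

Note that `cache:` is a required keyword with no default, so a bare `Twitterscraper::Client.new` now raises `ArgumentError`; callers outside the bundled CLI must pass it explicitly.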
data/lib/twitterscraper/query.rb CHANGED
@@ -75,9 +75,22 @@ module Twitterscraper
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
-      logger.debug("Scraping tweets from #{url}")
+      http_request = lambda do
+        logger.debug("Scraping tweets from #{url}")
+        get_single_page(url, headers, proxies)
+      end
 
-      response = get_single_page(url, headers, proxies)
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug('Fetching tweets from cache')
+        else
+          response = http_request.call
+          client.write(url, response)
+        end
+      else
+        response = http_request.call
+      end
       return [], nil if response.nil?
 
       html, json_resp = parse_single_page(response, pos.nil?)
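One observation on the branch above: it hand-rolls the read/check/write sequence even though the new `Cache` class already ships a `fetch` helper that encapsulates exactly that pattern. A hypothetical rewrite, not the shipped code:

```ruby
# same behavior sketched with Cache#fetch: yields only on a miss or expired entry
response = if cache_enabled?
             Cache.new.fetch(url) { http_request.call }
           else
             http_request.call
           end
```

The one behavioral difference is logging: the shipped version emits 'Fetching tweets from cache' on a hit, which `fetch` alone does not.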
data/lib/twitterscraper/tweet.rb CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
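`Tweet.from_json` makes previously exported results round-trippable; a minimal sketch assuming a file produced by the CLI's default `--output tweets.json`, and assuming `Tweet` exposes a `created_at` reader (which the parsing in the hunk implies):

```ruby
require 'twitterscraper'

# re-hydrate Tweet objects from an earlier JSON export
tweets = Twitterscraper::Tweet.from_json(File.read('tweets.json'))
tweets.first.created_at # => a Time object, parsed back from its serialized string
```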
data/lib/twitterscraper/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.10.0'
+  VERSION = '0.11.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-13 00:00:00.000000000 Z
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -61,6 +61,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb