twitterscraper-ruby 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b968497864febb3a245e9d3ce42cc7fe948bfc3c1995401597cfcadca352b46
4
- data.tar.gz: fd23f719a5dd760a397e936bd9cbea3a8c4d9de9b380728f954da11b753f0531
3
+ metadata.gz: 6791ebfd82694e768350ec33a19d3a34336c26c5344e57ef92af6bf02a0dddf1
4
+ data.tar.gz: 351bf02ad483c60993114a828a4a1e39b936ab8e1373237aa66de6d0a3c809a6
5
5
  SHA512:
6
- metadata.gz: a080abd711c46c34d366525acbf1f7f41db5b6c81110be0d615bf9a73401129328f24536278da96f54e806be910e4ac7e6f9dbd74a4a0bebac6352d5c808a7e2
7
- data.tar.gz: d49db2835fb8ccb4b491bea050c2fd0024569b3788c873c126ce62b27951362223c381b81d9d8890281bb50fff26b2c923c3040dee6b1a0f35394c0bc236d3a2
6
+ metadata.gz: 236d01eaaf4ed8c5c016fff35b5794e1609e840d8d27edfba92e0fc63138dfced1a10f4e2952d919360d9cc5111bc4987422a9e709c3e798434614b734b3b029
7
+ data.tar.gz: 6831c32b358651e8c75af0772afbd0f2888934e5ef314112ecaa2dab1bcaeb681dc6a350d473eab79e36a83da57059b35dd88d693cb3a2f894789cb03ceb1e8c
@@ -0,0 +1 @@
1
+ 2.6.4
@@ -1,12 +1,16 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.1.0)
4
+ twitterscraper-ruby (0.2.0)
5
+ nokogiri
5
6
 
6
7
  GEM
7
8
  remote: https://rubygems.org/
8
9
  specs:
10
+ mini_portile2 (2.4.0)
9
11
  minitest (5.14.1)
12
+ nokogiri (1.10.10)
13
+ mini_portile2 (~> 2.4.0)
10
14
  rake (12.3.3)
11
15
 
12
16
  PLATFORMS
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Twitterscraper::Ruby
1
+ # twitterscraper-ruby
2
2
 
3
3
  Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/twitterscraper/ruby`. To experiment with that code, run `bin/console` for an interactive prompt.
4
4
 
@@ -22,7 +22,9 @@ Or install it yourself as:
22
22
 
23
23
  ## Usage
24
24
 
25
- TODO: Write usage instructions here
25
+ ```ruby
26
+ require 'twitterscraper'
27
+ ```
26
28
 
27
29
  ## Development
28
30
 
@@ -1,6 +1,26 @@
1
- require "version"
1
+ require 'twitterscraper/logger'
2
+ require 'twitterscraper/proxy'
3
+ require 'twitterscraper/http'
4
+ require 'twitterscraper/lang'
5
+ require 'twitterscraper/query'
6
+ require 'twitterscraper/client'
7
+ require 'twitterscraper/tweet'
8
+ require 'version'
2
9
 
3
10
  module Twitterscraper
4
11
  class Error < StandardError; end
5
12
  # Your code goes here...
13
+
14
+ def self.logger
15
+ @logger ||= ::Logger.new(STDOUT)
16
+ end
17
+
18
+ def self.logger=(logger)
19
+ if logger.nil?
20
+ self.logger.level = ::Logger::FATAL
21
+ return self.logger
22
+ end
23
+
24
+ @logger = logger
25
+ end
6
26
  end
@@ -0,0 +1,5 @@
1
+ module Twitterscraper
2
+ class Client
3
+ include Query
4
+ end
5
+ end
@@ -0,0 +1,30 @@
1
+ module Twitterscraper
2
+ module Http
3
+
4
+ module_function
5
+
6
+ def get(url, headers = {}, proxy = nil, timeout = nil)
7
+ timeout ||= 3
8
+
9
+ if proxy
10
+ ip, port = proxy.split(':')
11
+ http_class = Net::HTTP::Proxy(ip, port.to_i)
12
+ else
13
+ http_class = Net::HTTP
14
+ end
15
+
16
+ uri = URI.parse(url)
17
+ http = http_class.new(uri.host, uri.port)
18
+ http.use_ssl = true if url.match?(/^https/)
19
+ http.open_timeout = timeout
20
+ http.read_timeout = timeout
21
+ req = Net::HTTP::Get.new(uri)
22
+
23
+ headers.each do |key, value|
24
+ req[key] = value
25
+ end
26
+
27
+ http.request(req).body
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,40 @@
1
+ module Twitterscraper
2
+ class Lang
3
+ LIST = [
4
+ 'en', # English
5
+ 'ar', # Arabic
6
+ 'bn', # Bengali
7
+ 'cs', # Czech
8
+ 'da', # Danish
9
+ 'de', # German
10
+ 'el', # Greek
11
+ 'es', # Spanish
12
+ 'fa', # Persian
13
+ 'fi', # Finnish
14
+ 'fil', # Filipino
15
+ 'fr', # French
16
+ 'he', # Hebrew
17
+ 'hi', # Hindi
18
+ 'hu', # Hungarian
19
+ 'id', # Indonesian
20
+ 'it', # Italian
21
+ 'ja', # Japanese
22
+ 'ko', # Korean
23
+ 'msa', # Malay
24
+ 'nl', # Dutch
25
+ 'no', # Norwegian
26
+ 'pl', # Polish
27
+ 'pt', # Portuguese
28
+ 'ro', # Romanian
29
+ 'ru', # Russian
30
+ 'sv', # Swedish
31
+ 'th', # Thai
32
+ 'tr', # Turkish
33
+ 'uk', # Ukrainian
34
+ 'ur', # Urdu
35
+ 'vi', # Vietnamese
36
+ 'zh-cn', # Chinese Simplified
37
+ 'zh-tw', # Chinese Traditional
38
+ ]
39
+ end
40
+ end
@@ -0,0 +1,9 @@
1
+ require 'logger'
2
+
3
+ module Twitterscraper
4
+ module Logger
5
+ def logger
6
+ Twitterscraper.logger
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,33 @@
1
+ module Twitterscraper
2
+ module Proxy
3
+
4
+ PROXY_URL = 'https://free-proxy-list.net/'
5
+
6
+ class RetryExhausted < StandardError
7
+ end
8
+
9
+ module_function
10
+
11
+ def get_proxies(retries = 3)
12
+ response = Twitterscraper::Http.get(PROXY_URL)
13
+ html = Nokogiri::HTML(response)
14
+ table = html.xpath('//*[@id="proxylisttable"]').first
15
+
16
+ proxies = []
17
+
18
+ table.xpath('tbody/tr').each do |tr|
19
+ cells = tr.xpath('td')
20
+ ip, port = cells[0].text.strip, cells[1].text.strip
21
+ proxies << ip + ':' + port
22
+ end
23
+
24
+ proxies
25
+ rescue => e
26
+ if (retries -= 1) > 0
27
+ retry
28
+ else
29
+ raise RetryExhausted.new(e.inspect)
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,147 @@
1
+ require 'net/http'
2
+ require 'nokogiri'
3
+ require 'date'
4
+ require 'json'
5
+
6
+ module Twitterscraper
7
+ module Query
8
+ include Logger
9
+
10
+ USER_AGENT_LIST = [
11
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
12
+ 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
13
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
14
+ 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
15
+ 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
16
+ ]
17
+ USER_AGENT = USER_AGENT_LIST.sample
18
+
19
+ INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
20
+ RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
21
+ 'default&include_available_features=1&include_entities=1&' +
22
+ 'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
23
+ INIT_URL_USER = 'https://twitter.com/{u}'
24
+ RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/{u}/timeline/tweets?' +
25
+ 'include_available_features=1&include_entities=1&' +
26
+ 'max_position={pos}&reset_error_state=false'
27
+
28
+ def get_query_url(query, lang, pos, from_user = false)
29
+ # if from_user
30
+ # if !pos
31
+ # INIT_URL_USER.format(u = query)
32
+ # else
33
+ # RELOAD_URL_USER.format(u = query, pos = pos)
34
+ # end
35
+ # end
36
+ if pos
37
+ RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
38
+ else
39
+ INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
40
+ end
41
+ end
42
+
43
+ def query_single_page(query, lang, pos, retries = 30, from_user = false, timeout = 3, headers: [], proxies: [])
44
+ query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
45
+ logger.info("Querying #{query}")
46
+
47
+ url = get_query_url(query, lang, pos, from_user)
48
+ logger.debug("Scraping tweets from #{url}")
49
+
50
+ response = nil
51
+ begin
52
+ proxy = proxies.sample
53
+ logger.info("Using proxy #{proxy}")
54
+
55
+ response = Twitterscraper::Http.get(url, headers, proxy, timeout)
56
+ rescue => e
57
+ logger.debug "query_single_page: #{e.inspect}"
58
+ if (retries -= 1) > 0
59
+ logger.info("Retrying... (Attempts left: #{retries - 1})")
60
+ retry
61
+ else
62
+ raise
63
+ end
64
+ end
65
+
66
+ html = ''
67
+ json_resp = nil
68
+
69
+ if pos
70
+ begin
71
+ json_resp = JSON.parse(response)
72
+ html = json_resp['items_html'] || ''
73
+ rescue => e
74
+ logger.warn("Failed to parse JSON #{e.inspect} while requesting #{url}")
75
+ end
76
+ else
77
+ html = response || ''
78
+ end
79
+
80
+ tweets = Tweet.from_html(html)
81
+
82
+ if tweets.empty?
83
+ if json_resp && json_resp['has_more_items']
84
+ pos = json_resp['min_position']
85
+ else
86
+ pos = nil
87
+ end
88
+ return [], pos
89
+ end
90
+
91
+ if json_resp
92
+ [tweets, json_resp['min_position']]
93
+ elsif from_user
94
+ raise NotImplementedError
95
+ else
96
+ [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
97
+ end
98
+ end
99
+
100
+ def query_tweets(query, start_date: nil, end_date: nil, limit: 100, threads: 2, lang: '')
101
+ start_date = start_date ? Date.parse(start_date) : Date.parse('2006-3-21')
102
+ end_date = end_date ? Date.parse(end_date) : Date.today
103
+ if start_date == end_date
104
+ raise 'Please specify different values for :start_date and :end_date.'
105
+ elsif start_date > end_date
106
+ raise 'The :start_date must occur before :end_date.'
107
+ end
108
+
109
+ # TODO parallel
110
+
111
+ pos = nil
112
+ all_tweets = []
113
+
114
+ proxies = Twitterscraper::Proxy.get_proxies
115
+ logger.info "Using #{proxies.size} proxies"
116
+
117
+ headers = {'User-Agent': USER_AGENT, 'X-Requested-With': 'XMLHttpRequest'}
118
+ logger.info("Headers #{headers}")
119
+
120
+ start_date.upto(end_date) do |date|
121
+ break if date == end_date
122
+
123
+ queries = query + " since:#{date} until:#{date + 1}"
124
+
125
+ while true
126
+ new_tweets, new_pos = query_single_page(queries, lang, pos, headers: headers, proxies: proxies)
127
+ logger.info("Got #{new_tweets.size} tweets")
128
+ logger.debug("new_pos=#{new_pos}")
129
+
130
+ unless new_tweets.empty?
131
+ all_tweets.concat(new_tweets)
132
+ all_tweets.uniq! { |t| t.tweet_id }
133
+ end
134
+
135
+ break unless new_pos
136
+ break if all_tweets.size >= limit
137
+
138
+ pos = new_pos
139
+ end
140
+
141
+ break if all_tweets.size >= limit
142
+ end
143
+
144
+ all_tweets
145
+ end
146
+ end
147
+ end
@@ -0,0 +1,41 @@
1
+ require 'time'
2
+
3
+ module Twitterscraper
4
+ class Tweet
5
+ attr_reader :screen_name, :name, :user_id, :tweet_id, :tweet_url, :timestamp, :created_at, :text
6
+
7
+ def initialize(attrs)
8
+ attrs.each do |key, value|
9
+ instance_variable_set("@#{key}", value)
10
+ end
11
+ end
12
+
13
+ class << self
14
+ def from_html(text)
15
+ html = Nokogiri::HTML(text)
16
+ from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
17
+ end
18
+
19
+ def from_tweets_html(html)
20
+ html.map do |tweet|
21
+ from_tweet_html(tweet)
22
+ end
23
+ end
24
+
25
+ def from_tweet_html(html)
26
+ inner_html = Nokogiri::HTML(html.inner_html)
27
+ timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
28
+ new(
29
+ screen_name: html.attr('data-screen-name'),
30
+ name: html.attr('data-name'),
31
+ user_id: html.attr('data-user-id').to_i,
32
+ tweet_id: html.attr('data-tweet-id').to_i,
33
+ tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
34
+ timestamp: timestamp,
35
+ created_at: Time.at(timestamp, in: '+00:00'),
36
+ text: inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text,
37
+ )
38
+ end
39
+ end
40
+ end
41
+ end
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -24,4 +24,6 @@ Gem::Specification.new do |spec|
24
24
  spec.bindir = "exe"
25
25
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
26
26
  spec.require_paths = ["lib"]
27
+
28
+ spec.add_dependency "nokogiri"
27
29
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-11 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2020-07-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  description: A gem to scrape Tweets
14
28
  email:
15
29
  - ts_3156@yahoo.co.jp
@@ -19,6 +33,7 @@ extra_rdoc_files: []
19
33
  files:
20
34
  - ".gitignore"
21
35
  - ".irbrc"
36
+ - ".ruby-version"
22
37
  - ".travis.yml"
23
38
  - CODE_OF_CONDUCT.md
24
39
  - Gemfile
@@ -30,6 +45,13 @@ files:
30
45
  - bin/setup
31
46
  - lib/twitterscraper-ruby.rb
32
47
  - lib/twitterscraper.rb
48
+ - lib/twitterscraper/client.rb
49
+ - lib/twitterscraper/http.rb
50
+ - lib/twitterscraper/lang.rb
51
+ - lib/twitterscraper/logger.rb
52
+ - lib/twitterscraper/proxy.rb
53
+ - lib/twitterscraper/query.rb
54
+ - lib/twitterscraper/tweet.rb
33
55
  - lib/version.rb
34
56
  - twitterscraper-ruby.gemspec
35
57
  homepage: https://github.com/ts-3156/twitterscraper-ruby
@@ -54,8 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
54
76
  - !ruby/object:Gem::Version
55
77
  version: '0'
56
78
  requirements: []
57
- rubyforge_project:
58
- rubygems_version: 2.7.6
79
+ rubygems_version: 3.0.3
59
80
  signing_key:
60
81
  specification_version: 4
61
82
  summary: A gem to scrape Tweets