goldtweets 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/goldtweets.rb +10 -0
- data/lib/goldtweets/client.rb +225 -0
- data/lib/goldtweets/search.rb +56 -0
- data/lib/goldtweets/tweet.rb +4 -0
- metadata +88 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d05e029b4a06fa88926306487069f5778f83a3eb40f6ef714004636319d0ed26
|
4
|
+
data.tar.gz: f645ec888e05149aa30c387f8bd763680ac6869edd643d112d32ef4fec60f91e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a7326365bed9d11762dd2ecc63f32e600dd60545b709decf60a605b0a0716528121ca61ad9ab76841da4d313ad6bf20e18d36a42e3fd19f30aae83a8f97d0418
|
7
|
+
data.tar.gz: f18dfbc3e5bf8155c2213fcf36aa2a3f9fc99908ed51fab79535f32e3bf5c77d9b10e4ac5c8abdd2c98a48978df4e88c072ac267f2a6cb486ade7a6d6e6ea6e8
|
data/lib/goldtweets.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'goldtweets/client'
|
2
|
+
require 'goldtweets/search'
|
3
|
+
require 'goldtweets/tweet'
|
4
|
+
|
5
|
+
module GoldTweets
  class << self
    # Shortcut so callers can write GoldTweets.get_tweets(criteria) instead of
    # going through GoldTweets::Client; the criteria object is handed over
    # untouched and the client's return value is passed straight back.
    def get_tweets(criteria)
      ::GoldTweets::Client.get_tweets(criteria)
    end
  end
end
|
@@ -0,0 +1,225 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'net/http'
|
6
|
+
require 'uri'
|
7
|
+
|
8
|
+
require 'goldtweets/tweet'
|
9
|
+
|
10
|
+
module GoldTweets
  # Queries Twitter's legacy search timeline endpoint and turns the HTML
  # fragments it returns into GoldTweets::Tweet objects.
  module Client
    # User agents to present to Twitter search; one is picked at random per
    # get_tweets call.
    USER_AGENTS = [
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
      'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
      'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'
    ].freeze

    # Static list of headers to be sent with API requests
    DEFAULT_HEADERS = {
      'Host'             => 'twitter.com',
      'Accept'           => 'application/json, text/javascript, */*; q=0.01',
      'Accept-Language'  => 'en-US,en;q=0.5',
      'X-Requested-With' => 'XMLHttpRequest',
      'Connection'       => 'keep-alive'
    }.freeze

    # How many usernames to put in a single search
    USERNAMES_PER_BATCH = 20

    # URLs for searching and generating permalinks back to tweets
    SEARCH_PREFIX    = 'https://twitter.com/i/search/timeline?'
    PERMALINK_PREFIX = 'https://twitter.com'

    # Static list of parameters sent with a search
    DEFAULT_PARAMETERS = {
      'vertical'                   => 'news',
      'src'                        => 'typd',
      'include_available_features' => '1',
      'include_entities'           => '1',
      'reset_error_state'          => 'false'
    }.freeze

    # XPath selectors. Each `contains(concat(' ', normalize-space(@class), ' '), ' x ')`
    # clause is the XPath 1.0 way of testing for a whole CSS class token.
    TWEETS_SELECTOR    = "//div[contains(concat(' ', normalize-space(@class), ' '), ' js-stream-tweet ') and not(contains(concat(' ', normalize-space(@class), ' '), ' withheld-tweet '))]"
    USERNAMES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' username ') and contains(concat(' ', normalize-space(@class), ' '), ' u-dir ')]/b"
    AUTHORID_SELECTOR  = ".//a[contains(concat(' ', normalize-space(@class), ' '), ' js-user-profile-link ')]"
    CONTENT_SELECTOR   = ".//p[contains(concat(' ', normalize-space(@class), ' '), ' js-tweet-text ')]"
    RETWEETS_SELECTOR  = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--retweet ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    FAVORITES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--favorite ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    REPLIES_SELECTOR   = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--reply ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    TIMESTAMP_SELECTOR = ".//small[contains(concat(' ', normalize-space(@class), ' '), ' time ')]//span[contains(concat(' ', normalize-space(@class), ' '), ' js-short-timestamp ')]"
    GEO_SELECTOR       = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' Tweet-geo ')]"
    LINK_SELECTOR      = ".//a"

    # Interim response structure useful for tweet fetch and processing logic
    #   body        - parsed HTML document holding one page of tweets
    #   new_cursor  - value to send as `max_position` to get the next page
    #   new_cookies - Cookie header value for follow-up requests, or nil
    #   more_items  - the endpoint's `has_more_items` flag
    Response = Struct.new(:body, :new_cursor, :new_cookies, :more_items)

    # Fetch tweets based on a GoldTweets::Search object.
    #
    # Usernames are searched in batches of USERNAMES_PER_BATCH; with no
    # usernames a single unrestricted search is run. Each batch is paged
    # through until the endpoint reports no more items, a page contains no
    # tweets, or `maximum_tweets` (when positive) has been collected for that
    # batch. The limit applies per batch, and at most that many tweets are
    # kept from each batch.
    #
    # The criteria object passed in is not modified; every batch works on a
    # shallow copy.
    #
    # Features of the original Python library that are not implemented here:
    # proxy support, emoji handling, and running a caller-supplied block on
    # tweets as they are processed.
    #
    # Returns a flat Array of GoldTweets::Tweet objects.
    def self.get_tweets(criteria)
      user_agent = USER_AGENTS.sample
      cookie_jar = ''
      limit      = criteria.maximum_tweets.to_i
      batches    = usernames_for(criteria.usernames).each_slice(USERNAMES_PER_BATCH).to_a
      batches    = [nil] if batches.empty? # no usernames: one search without a from: filter

      batches.flat_map do |batch|
        batch_criteria = criteria.dup
        batch_criteria.usernames = batch
        cursor    = ''
        collected = []

        loop do
          response   = fetch_tweets(batch_criteria, cursor, cookie_jar, user_agent)
          cookie_jar = response.new_cookies if response.new_cookies
          cursor     = response.new_cursor

          nodes = response.body.xpath(TWEETS_SELECTOR)
          collected.concat(nodes.reduce([], &method(:parse_tweet)))

          # An empty page is treated as the end of the results even when the
          # endpoint still claims to have more, so the loop cannot spin forever.
          break if nodes.empty? || !response.more_items
          break if limit.positive? && collected.length >= limit
        end

        limit.positive? ? collected.first(limit) : collected
      end
    end

    # Internal helpers follow. NOTE: they remain publicly callable for backward
    # compatibility; a bare `private` has no effect on `def self.` methods.

    # Coerce usernames into a flat list suitable for batching: a leading "@"
    # is dropped and names are downcased. Anything that is neither an Array
    # nor a String (e.g. nil) yields an empty list.
    def self.usernames_for(users)
      case users
      when Array  then users.map { |user| user.to_s.delete_prefix('@').downcase }
      when String then [users.delete_prefix('@').downcase]
      else []
      end
    end

    # Function for folding a list of Nokogiri objects fetched from Twitter into
    # a list of GoldTweets::Tweet objects.
    #
    # tweets - accumulator of tweets parsed so far (not mutated)
    # tweet  - node for a single tweet
    #
    # Returns a new Array with the parsed tweet appended, or `tweets` itself
    # when the node carries no username.
    def self.parse_tweet(tweets, tweet)
      users = tweet.xpath(USERNAMES_SELECTOR).map(&:text)
      return tweets if users.empty?

      links   = tweet.xpath(LINK_SELECTOR)
      content = tweet.xpath(CONTENT_SELECTOR).first
      author  = tweet.xpath(AUTHORID_SELECTOR).first
      time    = tweet.xpath(TIMESTAMP_SELECTOR).first
      path    = tweet.attr('data-permalink-path')

      retweets, faves, replies = tweet_interactions(tweet)
      hashtags, mentions       = tweet_hashtags_and_mentions(links)

      parsed = ::GoldTweets::Tweet.new(users.first)
      parsed.to        = users[1]
      parsed.text      = content && sanitize_message(content)
      parsed.retweets  = retweets
      parsed.faves     = faves
      parsed.replies   = replies
      parsed.id        = tweet.attr('data-tweet-id')
      # A node without a permalink path gets a nil permalink instead of raising.
      parsed.permalink = path && "#{PERMALINK_PREFIX}#{path}"
      parsed.author_id = author && author.attr('data-user-id').to_i
      parsed.timestamp = time && Time.at(time.attr('data-time').to_i)
      parsed.hashtags  = hashtags
      parsed.mentions  = mentions
      parsed.geo       = tweet.xpath(GEO_SELECTOR).first&.attr('title').to_s
      parsed.links     = links.map { |link| link.attr('data-expanded-url') }.compact

      tweets + [parsed]
    end

    # Normalize spacing and remove errant spaces following pound signs, at
    # signs, and dollar signs
    def self.sanitize_message(tweet)
      tweet.text
           .gsub(/\s+/, ' ')
           .gsub(/([#@\$]) /, '\1')
    end

    # Classify interactions (retweets, faves, and replies to a given tweet).
    # Returns [retweets, faves, replies] as Integers; a missing counter is 0.
    def self.tweet_interactions(tweet)
      [RETWEETS_SELECTOR, FAVORITES_SELECTOR, REPLIES_SELECTOR].map do |selector|
        tweet.xpath(selector).first&.attr('data-tweet-stat-count').to_i
      end
    end

    # Classify links belonging to hashtags and (outgoing) mentions within a
    # tweet. Returns [hashtags, mentions], e.g. [['#ruby'], ['@alice']].
    def self.tweet_hashtags_and_mentions(links)
      links.each_with_object([[], []]) do |link, (hashtags, mentions)|
        href = link.attr('href').to_s
        # Only site-relative links can be hashtags or mentions. Skip the rest
        # with `next`: a `return` here would abandon every remaining link.
        next unless href.start_with?('/')

        if link.attr('data-mentioned-user-id')
          mentions << "@#{href[1..-1]}"
        elsif href.start_with?('/hashtag/')
          hashtags << href.sub(%r{\A/hashtag/}, '#').sub(/\?.*\z/, '')
        end
      end
    end

    # Build the value of the `q` search parameter from the criteria: free-text
    # query, excluded words, author filter, date range, engagement minimums and
    # location, in that order, separated by single spaces.
    def self.search_query(criteria)
      terms = []
      terms << criteria.query if criteria.query
      terms.concat(Array(criteria.exclude_words).map { |word| "-#{word}" })

      # Flatten and drop blanks so an empty batch never becomes "from:[]".
      usernames = Array(criteria.username).flatten.compact
      terms << usernames.map { |user| "from:#{user}" }.join(' OR ') unless usernames.empty?

      terms << "since:#{criteria.since}"                   if criteria.since
      terms << "until:#{criteria.upto}"                    if criteria.upto
      terms << "min_replies:#{criteria.minimum_replies}"   if criteria.minimum_replies
      terms << "min_faves:#{criteria.minimum_faves}"       if criteria.minimum_faves
      terms << "min_retweets:#{criteria.minimum_retweets}" if criteria.minimum_retweets

      if criteria.maximum_distance
        if criteria.near
          terms << "near:#{criteria.near} within:#{criteria.maximum_distance}"
        elsif criteria.lat && criteria.lon
          terms << "geocode:#{criteria.lat},#{criteria.lon},#{criteria.maximum_distance}"
        end
      end

      terms.join(' ').strip
    end

    # Perform a search for tweets based on criteria specified.
    #
    # criteria       - search criteria (see GoldTweets::Search)
    # refresh_cursor - `max_position` cursor from the previous page ('' for the first)
    # cookie_jar     - Cookie header value to send, or ''/nil for none
    # user_agent     - User-Agent header value
    #
    # Returns a Response. Raises whatever Net::HTTP raises on connection
    # failure and JSON::ParserError when the body is not JSON.
    def self.fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
      search = DEFAULT_PARAMETERS.dup
      search['f'] = 'tweets' unless criteria.top_tweets?
      # An empty language string is truthy in Ruby; only send `l` when it has a value.
      search['l'] = criteria.language unless criteria.language.to_s.empty?
      search['q'] = search_query(criteria)
      search['max_position'] = refresh_cursor

      url = SEARCH_PREFIX + URI.encode_www_form(search)
      uri = URI(url)

      Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
        request = Net::HTTP::Get.new(uri)
        DEFAULT_HEADERS.each { |name, value| request[name] = value }
        request['User-Agent'] = user_agent
        request['Referer']    = url
        # Cookies go back to the server in `Cookie`; `Set-Cookie` is a
        # response-only header and would be ignored.
        request['Cookie'] = cookie_jar unless cookie_jar.to_s.empty?

        response = http.request(request)
        json     = JSON.parse(response.body)

        # Keep only the name=value part of each Set-Cookie line; attributes
        # such as Path or Expires do not belong in a Cookie request header.
        set_cookies = response.get_fields('set-cookie')
        new_cookies = set_cookies && set_cookies.map { |cookie| cookie.split(';').first }.join('; ')

        Response.new(Nokogiri::HTML(json['items_html']),
                     json['min_position'],
                     new_cookies,
                     json['has_more_items'])
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module GoldTweets
  # Search criteria handed to GoldTweets::Client. Built with keyword
  # arguments; every field has a default, so `Search.new` is valid.
  #
  # `usernames`/`usernames=` are aliases of `username`/`username=`, and
  # `top_tweets?` is an alias of `top_tweets`.
  Search = Struct.new(:username,
                      :exclude_words,
                      :since,
                      :upto,
                      :minimum_replies,
                      :minimum_faves,
                      :minimum_retweets,
                      :maximum_distance,
                      :near,
                      :lat,
                      :lon,
                      :query,
                      :maximum_tweets,
                      :language,
                      :emoji,
                      :top_tweets,
                      keyword_init: true) do

    # Set default values, otherwise no additional work done here.
    def initialize(username: nil,
                   exclude_words: [],
                   since: nil,
                   upto: nil,
                   minimum_replies: nil,
                   minimum_faves: nil,
                   minimum_retweets: nil,
                   maximum_distance: '15mi',
                   near: nil,
                   lat: nil,
                   lon: nil,
                   query: nil,
                   maximum_tweets: 0,
                   language: '',
                   emoji: :ignore,
                   top_tweets: false)
      # Bare `super` forwards every keyword parameter above, defaults
      # included, to Struct's keyword initializer, so no per-field assignment
      # is needed.
      super
    end

    alias_method :usernames=, :username=
    alias_method :usernames, :username
    alias_method :top_tweets?, :top_tweets
  end
end
|
metadata
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: goldtweets
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tina Wuest
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-07-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.10'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.10'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '13'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '13'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '5'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '5'
|
55
|
+
description: Ruby port of the Python GetOldTweets3 Twitter library
|
56
|
+
email: tina@wuest.me
|
57
|
+
executables: []
|
58
|
+
extensions: []
|
59
|
+
extra_rdoc_files: []
|
60
|
+
files:
|
61
|
+
- lib/goldtweets.rb
|
62
|
+
- lib/goldtweets/client.rb
|
63
|
+
- lib/goldtweets/search.rb
|
64
|
+
- lib/goldtweets/tweet.rb
|
65
|
+
homepage: https://gitlab.com/wuest/goldtweets
|
66
|
+
licenses:
|
67
|
+
- MIT
|
68
|
+
metadata: {}
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options: []
|
71
|
+
require_paths:
|
72
|
+
- lib
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 2.5.0
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubygems_version: 3.1.2
|
85
|
+
signing_key:
|
86
|
+
specification_version: 4
|
87
|
+
summary: Search Twitter including older tweets
|
88
|
+
test_files: []
|