goldtweets 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/goldtweets.rb +10 -0
- data/lib/goldtweets/client.rb +225 -0
- data/lib/goldtweets/search.rb +56 -0
- data/lib/goldtweets/tweet.rb +4 -0
- metadata +88 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d05e029b4a06fa88926306487069f5778f83a3eb40f6ef714004636319d0ed26
|
4
|
+
data.tar.gz: f645ec888e05149aa30c387f8bd763680ac6869edd643d112d32ef4fec60f91e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a7326365bed9d11762dd2ecc63f32e600dd60545b709decf60a605b0a0716528121ca61ad9ab76841da4d313ad6bf20e18d36a42e3fd19f30aae83a8f97d0418
|
7
|
+
data.tar.gz: f18dfbc3e5bf8155c2213fcf36aa2a3f9fc99908ed51fab79535f32e3bf5c77d9b10e4ac5c8abdd2c98a48978df4e88c072ac267f2a6cb486ade7a6d6e6ea6e8
|
data/lib/goldtweets.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'goldtweets/client'
|
2
|
+
require 'goldtweets/search'
|
3
|
+
require 'goldtweets/tweet'
|
4
|
+
|
5
|
+
module GoldTweets
  class << self
    # Shortcut so callers can write GoldTweets.get_tweets(criteria); the
    # criteria object is handed, unchanged, to GoldTweets::Client.get_tweets
    # and whatever that call returns is returned here.
    def get_tweets(criteria)
      ::GoldTweets::Client.get_tweets(criteria)
    end
  end
end
|
@@ -0,0 +1,225 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'net/http'
|
6
|
+
require 'uri'
|
7
|
+
|
8
|
+
require 'goldtweets/tweet'
|
9
|
+
|
10
|
+
module GoldTweets
|
11
|
+
module Client
|
12
|
+
# User agents to present to Twitter search
|
13
|
+
USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
14
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
|
15
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0',
|
16
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
17
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
18
|
+
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
19
|
+
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
20
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'
|
21
|
+
].freeze
|
22
|
+
|
23
|
+
# Static list of headers to be sent with API requests
|
24
|
+
DEFAULT_HEADERS = { 'Host' => 'twitter.com',
|
25
|
+
'Accept' => 'application/json, text/javascript, */*; q=0.01',
|
26
|
+
'Accept-Language' => 'en-US,en;q=0.5',
|
27
|
+
'X-Requested-With' => 'XMLHttpRequest',
|
28
|
+
'Connection' => 'keep-alive'
|
29
|
+
}.freeze
|
30
|
+
# How many usernames to put in a single search
|
31
|
+
USERNAMES_PER_BATCH = 20
|
32
|
+
|
33
|
+
# URLs for searching and generating permalinks back to tweets
|
34
|
+
SEARCH_PREFIX = 'https://twitter.com/i/search/timeline?'
|
35
|
+
PERMALINK_PREFIX = 'https://twitter.com'
|
36
|
+
|
37
|
+
# Static list of parameters sent with a search
|
38
|
+
DEFAULT_PARAMETERS = { 'vertical' => 'news',
|
39
|
+
'src' => 'typd',
|
40
|
+
'include_available_features' => '1',
|
41
|
+
'include_entities' => '1',
|
42
|
+
'reset_error_state' => 'false'
|
43
|
+
}.freeze
|
44
|
+
|
45
|
+
# XPath selectors
|
46
|
+
TWEETS_SELECTOR = "//div[contains(concat(' ', normalize-space(@class), ' '), ' js-stream-tweet ') and not(contains(concat(' ', normalize-space(@class), ' '), ' withheld-tweet '))]"
|
47
|
+
USERNAMES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' username ') and contains(concat(' ', normalize-space(@class), ' '), ' u-dir ')]/b"
|
48
|
+
AUTHORID_SELECTOR = ".//a[contains(concat(' ', normalize-space(@class), ' '), ' js-user-profile-link ')]"
|
49
|
+
CONTENT_SELECTOR = ".//p[contains(concat(' ', normalize-space(@class), ' '), ' js-tweet-text ')]"
|
50
|
+
RETWEETS_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--retweet ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
|
51
|
+
FAVORITES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--favorite ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
|
52
|
+
REPLIES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--reply ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
|
53
|
+
TIMESTAMP_SELECTOR = ".//small[contains(concat(' ', normalize-space(@class), ' '), ' time ')]//span[contains(concat(' ', normalize-space(@class), ' '), ' js-short-timestamp ')]"
|
54
|
+
GEO_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' Tweet-geo ')]"
|
55
|
+
LINK_SELECTOR = ".//a"
|
56
|
+
|
57
|
+
# Interim response structure useful for tweet fetch and processing logic
|
58
|
+
Response = Struct.new(:body, :new_cursor, :new_cookies, :more_items)
|
59
|
+
|
60
|
+
# Fetch tweets based on a GoldTweets::Search object.
#
# Usernames are searched in batches of USERNAMES_PER_BATCH; every batch is
# paged through with fetch_tweets until the response reports no more items
# or the batch has produced at least criteria.maximum_tweets tweets (a
# value of 0 or nil means "no limit"). The limit is applied per batch.
#
# The criteria object passed in is never modified: each batch works on a
# shallow copy whose usernames are replaced by the batch.
#
# This functionality is presently lacking several features of the original
# python library - proxy support, emoji handling, and allowing a provided
# block to be run on tweets as they are processed among them.
#
# Returns a flat Array of the objects produced by parse_tweet.
def self.get_tweets(criteria)
  user_agent = USER_AGENTS.sample
  cookie_jar = ''
  limit      = criteria.maximum_tweets.to_i

  # usernames_for answers [[]] when no usernames were supplied; flattening
  # turns that into "no usernames" instead of a bogus user named "[]".
  usernames = usernames_for(criteria.usernames).flatten
  batches   = usernames.each_slice(USERNAMES_PER_BATCH).to_a
  # A search without usernames still needs exactly one request cycle.
  batches   = [[]] if batches.empty?

  batches.flat_map do |batch|
    # Work on a copy so the caller's criteria keeps its full username list.
    batch_criteria = criteria.dup
    batch_criteria.usernames = batch

    refresh_cursor = ''
    collected      = []

    loop do
      response = fetch_tweets(batch_criteria, refresh_cursor, cookie_jar, user_agent)
      cookie_jar     = response.new_cookies if response.new_cookies
      refresh_cursor = response.new_cursor

      collected.concat(response.body.xpath(TWEETS_SELECTOR).reduce([], &method(:parse_tweet)))

      break if (limit > 0 && collected.length >= limit) || !response.more_items
    end

    # A page may overshoot the requested maximum; trim to honour it.
    limit > 0 ? collected.first(limit) : collected
  end
end
|
93
|
+
|
94
|
+
private
|
95
|
+
|
96
|
+
# Normalise the caller-supplied usernames into an Array ready for batching.
# A leading "@" is dropped and names are lower-cased. An Array is converted
# element by element, a single String becomes a one-element Array, and any
# other value (e.g. nil) yields [[]] so that one batch is still produced.
def self.usernames_for(users)
  tidy = ->(name) { name.sub(/^@/, '').downcase }

  if users.is_a?(Array)
    users.map(&tidy)
  elsif users.is_a?(String)
    [tidy.call(users)]
  else
    [[]]
  end
end
|
107
|
+
|
108
|
+
# Reducer used to fold Nokogiri tweet nodes into GoldTweets::Tweet objects.
#
# tweets - the accumulator: tweets parsed so far
# tweet  - the node for one tweet in the search result markup
#
# Returns a new Array with the parsed tweet appended, or the accumulator
# itself when the node carries no username.
def self.parse_tweet(tweets, tweet)
  names = tweet.xpath(USERNAMES_SELECTOR).map(&:text)
  return tweets if names.empty?

  text                      = tweet.xpath(CONTENT_SELECTOR).map(&method(:sanitize_message)).first
  retweets, faves, replies  = tweet_interactions(tweet)
  permalink                 = PERMALINK_PREFIX + tweet.attr('data-permalink-path')
  author_id                 = tweet.xpath(AUTHORID_SELECTOR).map { |node| node.attr('data-user-id').to_i }.first
  posted_at                 = tweet.xpath(TIMESTAMP_SELECTOR).map { |node| Time.at(node.attr('data-time').to_i) }.first
  anchors                   = tweet.xpath(LINK_SELECTOR)
  hashtags, mentions        = tweet_hashtags_and_mentions(anchors)
  geo                       = tweet.xpath(GEO_SELECTOR).map { |node| node.attr('title') }.first.to_s
  # Only anchors that carry an expanded URL count as external links.
  external_links            = anchors.map { |node| node.attr('data-expanded-url') }.select { |url| url }

  # The first username is the author; the second, when present, the addressee.
  parsed = ::GoldTweets::Tweet.new(names.first).tap do |t|
    t.to        = names[1]
    t.text      = text
    t.retweets  = retweets
    t.faves     = faves
    t.replies   = replies
    t.id        = tweet.attr('data-tweet-id')
    t.permalink = permalink
    t.author_id = author_id
    t.timestamp = posted_at
    t.hashtags  = hashtags
    t.mentions  = mentions
    t.geo       = geo
    t.links     = external_links
  end

  tweets + [parsed]
end
|
141
|
+
|
142
|
+
# Clean up the text of a tweet node: every run of whitespace collapses to a
# single space, then the one space directly following a pound sign, at sign
# or dollar sign is removed.
def self.sanitize_message(tweet)
  single_spaced = tweet.text.gsub(/\s+/, ' ')
  single_spaced.gsub(/([#@\$]) /) { Regexp.last_match(1) }
end
|
149
|
+
|
150
|
+
# Classify interactions (retweets, faves, and replies to a given tweet)
|
151
|
+
def self.tweet_interactions(tweet)
|
152
|
+
[RETWEETS_SELECTOR, FAVORITES_SELECTOR, REPLIES_SELECTOR].map do |selector|
|
153
|
+
tweet.xpath(selector)
|
154
|
+
.map { |node| node.attr('data-tweet-stat-count') }
|
155
|
+
.first
|
156
|
+
.to_i
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
# Classify links belonging to hashtags and (outgoing) mentions within a
|
161
|
+
# tweet
|
162
|
+
def self.tweet_hashtags_and_mentions(links)
|
163
|
+
links.reduce([[], []]) do |(hashtags, mentions), link|
|
164
|
+
href = link.attr('href')
|
165
|
+
return [hashtags, mentions] unless href.to_s[0] == '/'
|
166
|
+
if link.attr('data-mentioned-user-id')
|
167
|
+
[hashtags, mentions + ['@' + href[1..-1]]]
|
168
|
+
elsif /^\/hashtag\//.match(href)
|
169
|
+
[hashtags + [href.sub(/(?:^\/hashtag\/)/, '#').sub(/(?:\?.*$)/, '')], mentions]
|
170
|
+
else
|
171
|
+
[hashtags, mentions]
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
# Perform one search request for tweets based on the criteria specified.
#
# criteria       - search settings (query, usernames, dates, thresholds,
#                  location, language, top_tweets?)
# refresh_cursor - paging cursor sent as 'max_position' ('' for first page)
# cookie_jar     - cookie string to present to the server ('' for none)
# user_agent     - value for the User-Agent header
#
# Returns a Response holding the parsed 'items_html' document, the next
# cursor ('min_position'), the cookies the server asked us to keep
# (nil when it sent none) and the 'has_more_items' flag.
# Raises whatever Net::HTTP or JSON.parse raise on transport/format errors.
def self.fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
  search = DEFAULT_PARAMETERS.dup
  search['f'] = 'tweets' unless criteria.top_tweets?
  # An empty string is truthy in Ruby, so test for emptiness explicitly;
  # otherwise the default language ('') is sent as a blank 'l' parameter.
  search['l'] = criteria.language unless criteria.language.to_s.empty?

  terms = []
  terms << criteria.query if criteria.query
  terms.concat(Array(criteria.exclude_words).map { |word| "-#{word}" })

  # Accept a single name or a list; flattening drops the empty placeholder
  # batch ([[]]) used when no usernames were given.
  authors = Array(criteria.username).flatten
  terms << authors.map { |u| "from:#{u}" }.join(' OR ') unless authors.empty?

  terms << "since:#{criteria.since}" if criteria.since
  terms << "until:#{criteria.upto}" if criteria.upto
  terms << "min_replies:#{criteria.minimum_replies}" if criteria.minimum_replies
  terms << "min_faves:#{criteria.minimum_faves}" if criteria.minimum_faves
  terms << "min_retweets:#{criteria.minimum_retweets}" if criteria.minimum_retweets

  if criteria.maximum_distance
    if criteria.near
      terms << "near:#{criteria.near} within:#{criteria.maximum_distance}"
    elsif criteria.lat && criteria.lon
      terms << "geocode:#{criteria.lat},#{criteria.lon},#{criteria.maximum_distance}"
    end
  end

  search['q'] = terms.join(' ').strip
  search['max_position'] = refresh_cursor

  url = SEARCH_PREFIX + URI.encode_www_form(search)
  uri = URI(url)

  Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
    request = Net::HTTP::Get.new(uri)
    DEFAULT_HEADERS.each { |(k, v)| request[k] = v }
    request['User-Agent'] = user_agent
    request['Referer'] = url
    # Clients return cookies in the Cookie header; Set-Cookie is the
    # response header servers use to hand them out.
    request['Cookie'] = cookie_jar unless cookie_jar.to_s.empty?

    response = http.request(request)

    json = JSON.parse(response.body)
    html = Nokogiri::HTML(json['items_html'])

    # Keep only the name=value part of each Set-Cookie header; attributes
    # such as Path or Expires must not be echoed back to the server.
    set_cookies = response.get_fields('set-cookie')
    new_cookies = set_cookies && set_cookies.map { |cookie| cookie[/\A[^;]*/] }.join('; ')

    Response.new(html, json['min_position'], new_cookies, json['has_more_items'])
  end
end
|
224
|
+
end
|
225
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module GoldTweets
  # Search criteria understood by GoldTweets::Client.
  #
  # All members are supplied as keyword arguments; anything omitted takes
  # the default declared on #initialize below.
  #
  # username         - one username or an Array of them (also reachable
  #                    through the #usernames / #usernames= aliases)
  # exclude_words    - Array of words to exclude from the results
  # since, upto      - lower / upper date bounds ("since:" / "until:")
  # minimum_replies, minimum_faves, minimum_retweets
  #                  - interaction thresholds
  # maximum_distance - radius used together with near, or with lat and lon
  # near             - place name for location searches
  # lat, lon         - coordinates for geocode searches
  # query            - free-text search terms
  # maximum_tweets   - stop after this many tweets; 0 means no limit
  # language         - language code; '' leaves the language unrestricted
  # emoji            - emoji handling mode; defaults to :ignore
  #                    NOTE(review): no code in the client reads this yet
  # top_tweets       - when false the client asks for the full tweet
  #                    timeline rather than top tweets only
  Search = Struct.new(:username,
                      :exclude_words,
                      :since,
                      :upto,
                      :minimum_replies,
                      :minimum_faves,
                      :minimum_retweets,
                      :maximum_distance,
                      :near,
                      :lat,
                      :lon,
                      :query,
                      :maximum_tweets,
                      :language,
                      :emoji,
                      :top_tweets,
                      keyword_init: true) do

    # Set default values, otherwise no additional work done here.
    # The bare `super` forwards every keyword, with its default filled in,
    # to the Struct initializer - no explicit assignments are needed.
    def initialize(username:         nil,
                   exclude_words:    [],
                   since:            nil,
                   upto:             nil,
                   minimum_replies:  nil,
                   minimum_faves:    nil,
                   minimum_retweets: nil,
                   maximum_distance: '15mi',
                   near:             nil,
                   lat:              nil,
                   lon:              nil,
                   query:            nil,
                   maximum_tweets:   0,
                   language:         '',
                   emoji:            :ignore,
                   top_tweets:       false)
      super
    end

    alias_method :usernames=,  :username=
    alias_method :usernames,   :username
    alias_method :top_tweets?, :top_tweets
  end
end
|
metadata
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: goldtweets
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tina Wuest
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-07-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.10'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.10'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '13'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '13'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '5'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '5'
|
55
|
+
description: Ruby port of the Python GetOldTweets3 Twitter library
|
56
|
+
email: tina@wuest.me
|
57
|
+
executables: []
|
58
|
+
extensions: []
|
59
|
+
extra_rdoc_files: []
|
60
|
+
files:
|
61
|
+
- lib/goldtweets.rb
|
62
|
+
- lib/goldtweets/client.rb
|
63
|
+
- lib/goldtweets/search.rb
|
64
|
+
- lib/goldtweets/tweet.rb
|
65
|
+
homepage: https://gitlab.com/wuest/goldtweets
|
66
|
+
licenses:
|
67
|
+
- MIT
|
68
|
+
metadata: {}
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options: []
|
71
|
+
require_paths:
|
72
|
+
- lib
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 2.5.0
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubygems_version: 3.1.2
|
85
|
+
signing_key:
|
86
|
+
specification_version: 4
|
87
|
+
summary: Search Twitter including older tweets
|
88
|
+
test_files: []
|