goldtweets 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d05e029b4a06fa88926306487069f5778f83a3eb40f6ef714004636319d0ed26
4
+ data.tar.gz: f645ec888e05149aa30c387f8bd763680ac6869edd643d112d32ef4fec60f91e
5
+ SHA512:
6
+ metadata.gz: a7326365bed9d11762dd2ecc63f32e600dd60545b709decf60a605b0a0716528121ca61ad9ab76841da4d313ad6bf20e18d36a42e3fd19f30aae83a8f97d0418
7
+ data.tar.gz: f18dfbc3e5bf8155c2213fcf36aa2a3f9fc99908ed51fab79535f32e3bf5c77d9b10e4ac5c8abdd2c98a48978df4e88c072ac267f2a6cb486ade7a6d6e6ea6e8
@@ -0,0 +1,10 @@
1
+ require 'goldtweets/client'
2
+ require 'goldtweets/search'
3
+ require 'goldtweets/tweet'
4
+
5
module GoldTweets
  class << self
    # Top-level shortcut: forwards the given search criteria to
    # GoldTweets::Client.get_tweets and returns whatever it returns.
    def get_tweets(criteria)
      ::GoldTweets::Client.get_tweets(criteria)
    end
  end
end
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'net/http'
6
+ require 'uri'
7
+
8
+ require 'goldtweets/tweet'
9
+
10
module GoldTweets
  # Client for Twitter's search timeline endpoint.  It builds the search
  # request from a criteria object, pages through the JSON responses, and folds
  # the HTML fragment of every page into GoldTweets::Tweet objects.
  module Client
    # User agents to present to Twitter search
    USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
                    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
                    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0',
                    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
                    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
                    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
                    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'
                  ].freeze

    # Static list of headers to be sent with API requests
    DEFAULT_HEADERS = { 'Host'             => 'twitter.com',
                        'Accept'           => 'application/json, text/javascript, */*; q=0.01',
                        'Accept-Language'  => 'en-US,en;q=0.5',
                        'X-Requested-With' => 'XMLHttpRequest',
                        'Connection'       => 'keep-alive'
                      }.freeze

    # How many usernames to put in a single search
    USERNAMES_PER_BATCH = 20

    # URLs for searching and generating permalinks back to tweets
    SEARCH_PREFIX    = 'https://twitter.com/i/search/timeline?'
    PERMALINK_PREFIX = 'https://twitter.com'

    # Static list of parameters sent with a search
    DEFAULT_PARAMETERS = { 'vertical'                   => 'news',
                           'src'                        => 'typd',
                           'include_available_features' => '1',
                           'include_entities'           => '1',
                           'reset_error_state'          => 'false'
                         }.freeze

    # XPath selectors
    TWEETS_SELECTOR    = "//div[contains(concat(' ', normalize-space(@class), ' '), ' js-stream-tweet ') and not(contains(concat(' ', normalize-space(@class), ' '), ' withheld-tweet '))]"
    USERNAMES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' username ') and contains(concat(' ', normalize-space(@class), ' '), ' u-dir ')]/b"
    AUTHORID_SELECTOR  = ".//a[contains(concat(' ', normalize-space(@class), ' '), ' js-user-profile-link ')]"
    CONTENT_SELECTOR   = ".//p[contains(concat(' ', normalize-space(@class), ' '), ' js-tweet-text ')]"
    RETWEETS_SELECTOR  = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--retweet ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    FAVORITES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--favorite ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    REPLIES_SELECTOR   = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--reply ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    TIMESTAMP_SELECTOR = ".//small[contains(concat(' ', normalize-space(@class), ' '), ' time ')]//span[contains(concat(' ', normalize-space(@class), ' '), ' js-short-timestamp ')]"
    GEO_SELECTOR       = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' Tweet-geo ')]"
    LINK_SELECTOR      = ".//a"

    # Interim response structure useful for tweet fetch and processing logic
    Response = Struct.new(:body, :new_cursor, :new_cookies, :more_items)

    # Fetch tweets based on a GoldTweets::Search object.
    #
    # Usernames are normalised and searched in batches of USERNAMES_PER_BATCH;
    # each batch is paged until the server reports no more items, a page yields
    # no tweets, or the batch has reached criteria.maximum_tweets (when that is
    # greater than zero).  The limit applies per batch, and each batch is
    # truncated to it.  The caller's criteria object is not modified: every
    # batch works on a duplicate.
    #
    # Compared to the original Python library this is still missing proxy
    # support, emoji handling, and the ability to run a caller-supplied block
    # on tweets as they are processed.
    #
    # @param criteria [#usernames, #usernames=, #maximum_tweets] search criteria
    # @return [Array] flat list of parsed tweets across all batches
    def self.get_tweets(criteria)
      user_agent = USER_AGENTS.sample
      cookie_jar = ''
      limit      = criteria.maximum_tweets.to_i
      batches    = usernames_for(criteria.usernames).each_slice(USERNAMES_PER_BATCH).to_a
      # No usernames at all still means one (unrestricted) search.
      batches    = [nil] if batches.empty?

      batches.flat_map do |batch|
        scoped = criteria.dup
        scoped.usernames = batch
        refresh_cursor = ''
        collected = []

        loop do
          response       = fetch_tweets(scoped, refresh_cursor, cookie_jar, user_agent)
          cookie_jar     = response.new_cookies if response.new_cookies
          refresh_cursor = response.new_cursor

          tweets = response.body.xpath(TWEETS_SELECTOR).reduce([], &method(:parse_tweet))
          collected.concat(tweets)

          # An empty page would otherwise be re-requested for as long as the
          # server keeps claiming there are more items.
          break if tweets.empty? || !response.more_items
          break if limit.positive? && collected.length >= limit
        end

        limit.positive? ? collected.first(limit) : collected
      end
    end

    # The methods below are internal helpers.  They are defined with
    # `def self.`, so a bare `private` would not hide them; they remain
    # callable on the module exactly as before.

    # Coerce usernames into a suitable representation for batching: a flat
    # array of lower-cased names without a leading "@".  Anything that is
    # neither an Array nor a String yields an empty list.
    def self.usernames_for(users)
      case users
      when Array
        users.map { |user| user.sub(/\A@/, '').downcase }
      when String
        [users.sub(/\A@/, '').downcase]
      else
        []
      end
    end

    # Function for folding a list of Nokogiri objects fetched from Twitter into
    # a list of GoldTweets::Tweet objects.  Nodes without any username are
    # skipped and the accumulator is returned unchanged.
    def self.parse_tweet(tweets, tweet)
      users = tweet.xpath(USERNAMES_SELECTOR).map(&:text)
      return tweets if users.empty?

      links                     = tweet.xpath(LINK_SELECTOR)
      retweets, faves, replies  = tweet_interactions(tweet)
      hashtags, mentions        = tweet_hashtags_and_mentions(links)

      parsed = ::GoldTweets::Tweet.new(users.first)
      parsed.to        = users[1]
      parsed.text      = tweet.xpath(CONTENT_SELECTOR).map(&method(:sanitize_message)).first
      parsed.retweets  = retweets
      parsed.faves     = faves
      parsed.replies   = replies
      parsed.id        = tweet.attr('data-tweet-id')
      # Interpolation tolerates a missing permalink attribute (nil).
      parsed.permalink = "#{PERMALINK_PREFIX}#{tweet.attr('data-permalink-path')}"
      parsed.author_id = tweet.xpath(AUTHORID_SELECTOR).map { |node| node.attr('data-user-id').to_i }.first
      parsed.timestamp = tweet.xpath(TIMESTAMP_SELECTOR).map { |node| Time.at(node.attr('data-time').to_i) }.first
      parsed.hashtags  = hashtags
      parsed.mentions  = mentions
      parsed.geo       = tweet.xpath(GEO_SELECTOR).map { |node| node.attr('title') }.first.to_s
      parsed.links     = links.map { |node| node.attr('data-expanded-url') }.compact

      tweets + [parsed]
    end

    # Normalize spacing and remove errant spaces following pound signs, at
    # signs, and dollar signs
    def self.sanitize_message(tweet)
      tweet.text
           .gsub(/\s+/, ' ')
           .gsub(/([#@\$]) /, '\1')
    end

    # Classify interactions (retweets, faves, and replies to a given tweet).
    # Returns the three counts, in that order, as integers (0 when absent).
    def self.tweet_interactions(tweet)
      [RETWEETS_SELECTOR, FAVORITES_SELECTOR, REPLIES_SELECTOR].map do |selector|
        tweet.xpath(selector)
             .map { |node| node.attr('data-tweet-stat-count') }
             .first
             .to_i
      end
    end

    # Classify links belonging to hashtags and (outgoing) mentions within a
    # tweet.  Only site-relative links (href starting with "/") are considered;
    # every other link is skipped without ending the scan, so hashtags and
    # mentions that follow an external link are still collected.
    #
    # @return [Array(Array<String>, Array<String>)] [hashtags, mentions]
    def self.tweet_hashtags_and_mentions(links)
      hashtags = []
      mentions = []

      links.each do |link|
        href = link.attr('href').to_s
        next unless href.start_with?('/')

        if link.attr('data-mentioned-user-id')
          mentions << "@#{href[1..-1]}"
        elsif href.start_with?('/hashtag/')
          hashtags << href.sub(%r{\A/hashtag/}, '#').sub(/\?.*\z/, '')
        end
      end

      [hashtags, mentions]
    end

    # Build the free-text search expression (the "q" parameter) from the
    # criteria: query, excluded words, authors, date bounds, interaction
    # minimums, and location.
    def self.search_query(criteria)
      terms = []
      terms << criteria.query if criteria.query
      terms.concat(Array(criteria.exclude_words).map { |word| "-#{word}" })

      authors = Array(criteria.username)
      terms << authors.map { |user| "from:#{user}" }.join(' OR ') unless authors.empty?

      terms << "since:#{criteria.since}" if criteria.since
      terms << "until:#{criteria.upto}" if criteria.upto
      terms << "min_replies:#{criteria.minimum_replies}" if criteria.minimum_replies
      terms << "min_faves:#{criteria.minimum_faves}" if criteria.minimum_faves
      terms << "min_retweets:#{criteria.minimum_retweets}" if criteria.minimum_retweets

      if criteria.maximum_distance
        if criteria.near
          terms << "near:#{criteria.near} within:#{criteria.maximum_distance}"
        elsif criteria.lat && criteria.lon
          terms << "geocode:#{criteria.lat},#{criteria.lon},#{criteria.maximum_distance}"
        end
      end

      terms.join(' ').strip
    end

    # Assemble the full set of request parameters for one page of results.
    def self.search_parameters(criteria, refresh_cursor)
      search = DEFAULT_PARAMETERS.dup
      search['f'] = 'tweets' unless criteria.top_tweets?
      # An empty string is truthy in Ruby; only send a language that is set.
      search['l'] = criteria.language unless criteria.language.to_s.empty?
      search['q'] = search_query(criteria)
      search['max_position'] = refresh_cursor
      search
    end

    # Reduce the Set-Cookie headers of a response to a value suitable for a
    # Cookie request header ("name=value; name=value"), dropping attributes
    # such as Path or Expires.  Returns nil when the response set no cookies.
    def self.cookies_from(response)
      pairs = Array(response.get_fields('set-cookie'))
              .map { |field| field.split(';', 2).first.to_s.strip }
              .reject(&:empty?)
      pairs.empty? ? nil : pairs.join('; ')
    end

    # Perform a search for tweets based on criteria specified.
    #
    # @param criteria search criteria (see search_query / search_parameters)
    # @param refresh_cursor [String] paging cursor from the previous response
    # @param cookie_jar [String] cookies to send back, may be empty
    # @param user_agent [String] User-Agent header value
    # @return [Response] parsed HTML body, next cursor, new cookies (or nil),
    #   and whether the server reports more items
    def self.fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
      url = SEARCH_PREFIX + URI.encode_www_form(search_parameters(criteria, refresh_cursor))
      uri = URI(url)

      Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
        request = Net::HTTP::Get.new(uri)
        DEFAULT_HEADERS.each { |name, value| request[name] = value }
        request['User-Agent'] = user_agent
        request['Referer']    = url
        # Cookies travel to the server in "Cookie"; "Set-Cookie" is the
        # response-side header.
        request['Cookie']     = cookie_jar unless cookie_jar.to_s.empty?

        response = http.request(request)
        json     = JSON.parse(response.body)

        Response.new(Nokogiri::HTML(json['items_html']),
                     json['min_position'],
                     cookies_from(response),
                     json['has_more_items'])
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
module GoldTweets
  # Search criteria handed to GoldTweets::Client.  All members are set through
  # keyword arguments; any member that is not supplied receives the default
  # declared in #initialize.
  Search = Struct.new(:username,
                      :exclude_words,
                      :since,
                      :upto,
                      :minimum_replies,
                      :minimum_faves,
                      :minimum_retweets,
                      :maximum_distance,
                      :near,
                      :lat,
                      :lon,
                      :query,
                      :maximum_tweets,
                      :language,
                      :emoji,
                      :top_tweets,
                      keyword_init: true) do

    # Set default values, otherwise no additional work done here.
    #
    # The bare `super` forwards every keyword parameter, with its current
    # (possibly defaulted) value, to the Struct initializer, so no explicit
    # assignment is needed in the body.
    def initialize(username: nil,
                   exclude_words: [],
                   since: nil,
                   upto: nil,
                   minimum_replies: nil,
                   minimum_faves: nil,
                   minimum_retweets: nil,
                   maximum_distance: '15mi',
                   near: nil,
                   lat: nil,
                   lon: nil,
                   query: nil,
                   maximum_tweets: 0,
                   language: '',
                   emoji: :ignore,
                   top_tweets: false)
      super
    end

    # Plural and predicate spellings of existing accessors.
    alias_method :usernames=, :username=
    alias_method :usernames, :username
    alias_method :top_tweets?, :top_tweets
  end
end
@@ -0,0 +1,4 @@
1
module GoldTweets
  # Value holder for the pieces of a tweet the client extracts from a search
  # result.  Members are positional, in the order listed here.
  Tweet = Struct.new(
    *%i[
      username text retweets faves replies hashtags mentions
      to id permalink author_id timestamp geo links
    ]
  )
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: goldtweets
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tina Wuest
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-07-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5'
55
+ description: Ruby port of the Python GetOldTweets3 Twitter library
56
+ email: tina@wuest.me
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - lib/goldtweets.rb
62
+ - lib/goldtweets/client.rb
63
+ - lib/goldtweets/search.rb
64
+ - lib/goldtweets/tweet.rb
65
+ homepage: https://gitlab.com/wuest/goldtweets
66
+ licenses:
67
+ - MIT
68
+ metadata: {}
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.5.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.1.2
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Search Twitter including older tweets
88
+ test_files: []