goldtweets 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d05e029b4a06fa88926306487069f5778f83a3eb40f6ef714004636319d0ed26
4
+ data.tar.gz: f645ec888e05149aa30c387f8bd763680ac6869edd643d112d32ef4fec60f91e
5
+ SHA512:
6
+ metadata.gz: a7326365bed9d11762dd2ecc63f32e600dd60545b709decf60a605b0a0716528121ca61ad9ab76841da4d313ad6bf20e18d36a42e3fd19f30aae83a8f97d0418
7
+ data.tar.gz: f18dfbc3e5bf8155c2213fcf36aa2a3f9fc99908ed51fab79535f32e3bf5c77d9b10e4ac5c8abdd2c98a48978df4e88c072ac267f2a6cb486ade7a6d6e6ea6e8
@@ -0,0 +1,10 @@
1
+ require 'goldtweets/client'
2
+ require 'goldtweets/search'
3
+ require 'goldtweets/tweet'
4
+
5
module GoldTweets
  class << self
    # Top-level shortcut for searching tweets.
    #
    # Forwards the given search criteria to GoldTweets::Client.get_tweets and
    # returns whatever that call returns.
    def get_tweets(criteria)
      ::GoldTweets::Client.get_tweets(criteria)
    end
  end
end
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'net/http'
6
+ require 'uri'
7
+
8
+ require 'goldtweets/tweet'
9
+
10
module GoldTweets
  # Client for Twitter's legacy search timeline endpoint.  It issues the HTTP
  # requests, pages through the results, and folds the returned HTML fragments
  # into GoldTweets::Tweet objects.
  module Client
    # User agents to present to Twitter search; one is sampled per get_tweets call
    USER_AGENTS = [
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
      'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
      'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'
    ].freeze

    # Static list of headers to be sent with API requests
    DEFAULT_HEADERS = {
      'Host' => 'twitter.com',
      'Accept' => 'application/json, text/javascript, */*; q=0.01',
      'Accept-Language' => 'en-US,en;q=0.5',
      'X-Requested-With' => 'XMLHttpRequest',
      'Connection' => 'keep-alive'
    }.freeze

    # How many usernames to put in a single search
    USERNAMES_PER_BATCH = 20

    # URLs for searching and generating permalinks back to tweets
    SEARCH_PREFIX    = 'https://twitter.com/i/search/timeline?'
    PERMALINK_PREFIX = 'https://twitter.com'

    # Static list of parameters sent with a search
    DEFAULT_PARAMETERS = {
      'vertical' => 'news',
      'src' => 'typd',
      'include_available_features' => '1',
      'include_entities' => '1',
      'reset_error_state' => 'false'
    }.freeze

    # XPath selectors
    TWEETS_SELECTOR    = "//div[contains(concat(' ', normalize-space(@class), ' '), ' js-stream-tweet ') and not(contains(concat(' ', normalize-space(@class), ' '), ' withheld-tweet '))]"
    USERNAMES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' username ') and contains(concat(' ', normalize-space(@class), ' '), ' u-dir ')]/b"
    AUTHORID_SELECTOR  = ".//a[contains(concat(' ', normalize-space(@class), ' '), ' js-user-profile-link ')]"
    CONTENT_SELECTOR   = ".//p[contains(concat(' ', normalize-space(@class), ' '), ' js-tweet-text ')]"
    RETWEETS_SELECTOR  = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--retweet ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    FAVORITES_SELECTOR = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--favorite ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    REPLIES_SELECTOR   = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-action--reply ')]/span[contains(concat(' ', normalize-space(@class), ' '), ' ProfileTweet-actionCount ')]"
    TIMESTAMP_SELECTOR = ".//small[contains(concat(' ', normalize-space(@class), ' '), ' time ')]//span[contains(concat(' ', normalize-space(@class), ' '), ' js-short-timestamp ')]"
    GEO_SELECTOR       = ".//span[contains(concat(' ', normalize-space(@class), ' '), ' Tweet-geo ')]"
    LINK_SELECTOR      = ".//a"

    # Interim response structure useful for tweet fetch and processing logic
    Response = Struct.new(:body, :new_cursor, :new_cookies, :more_items)

    # Fetch tweets based on a GoldTweets::Search object.
    #
    # Usernames are normalized and searched in batches of USERNAMES_PER_BATCH;
    # a search without usernames runs as a single batch.  Each batch is paged
    # until the endpoint reports no more items or, when
    # criteria.maximum_tweets is positive, until at least that many tweets
    # have been collected for the batch (the final page is not truncated).
    #
    # The criteria object passed in is not modified; each batch works on a
    # shallow copy.
    #
    # This functionality is presently lacking several features of the original
    # python library - proxy support, emoji handling, and allowing a provided
    # block to be run on tweets as they are processed among them.
    #
    # @param criteria [GoldTweets::Search] search parameters
    # @return [Array<GoldTweets::Tweet>] tweets from all batches, in fetch order
    def self.get_tweets(criteria)
      user_agent = USER_AGENTS.sample
      cookie_jar = ''
      batches    = usernames_for(criteria.usernames).each_slice(USERNAMES_PER_BATCH).to_a
      # No usernames still means one (username-less) search must be run.
      batches    = [[]] if batches.empty?
      limit      = criteria.maximum_tweets.to_i

      batches.flat_map do |batch|
        # Work on a copy so the caller's criteria keeps its own usernames.
        search = criteria.dup
        search.usernames = batch
        refresh_cursor = ''
        collected = []

        loop do
          response       = fetch_tweets(search, refresh_cursor, cookie_jar, user_agent)
          cookie_jar     = response.new_cookies if response.new_cookies
          refresh_cursor = response.new_cursor

          collected.concat(response.body.xpath(TWEETS_SELECTOR).reduce([], &method(:parse_tweet)))

          break if limit.positive? && collected.length >= limit
          break unless response.more_items
        end

        collected
      end
    end

    # NOTE: the helpers below are implementation details (@api private).  A
    # bare `private` has no effect on methods defined with `def self.`, so
    # they have always been callable from outside; they are left callable to
    # avoid breaking existing callers, but should not be relied upon.

    # Coerce usernames into a flat list suitable for batching: leading "@"
    # removed and lower-cased.
    #
    # @param users [Array<String>, String, nil]
    # @return [Array<String>] empty when no usable usernames were given
    def self.usernames_for(users)
      case users
      when Array
        users.map { |user| user.to_s.sub(/\A@/, '').downcase }
      when String
        [users.sub(/\A@/, '').downcase]
      else
        []
      end
    end

    # Function for folding a list of Nokogiri objects fetched from Twitter into
    # a list of GoldTweets::Tweet objects.  Nodes without any username are
    # skipped.
    #
    # @param tweets [Array<GoldTweets::Tweet>] accumulator
    # @param tweet [#xpath, #attr] a single tweet node
    # @return [Array<GoldTweets::Tweet>] a new array including the parsed tweet
    def self.parse_tweet(tweets, tweet)
      users = tweet.xpath(USERNAMES_SELECTOR).map(&:text)
      return tweets if users.empty?

      retweets, faves, replies = tweet_interactions(tweet)
      links                    = tweet.xpath(LINK_SELECTOR)
      hashtags, mentions       = tweet_hashtags_and_mentions(links)
      # A node without a permalink path yields a nil permalink rather than
      # raising on String + nil.
      permalink_path           = tweet.attr('data-permalink-path')

      parsed = ::GoldTweets::Tweet.new(users.first)
      parsed.to        = users[1]
      parsed.text      = tweet.xpath(CONTENT_SELECTOR).map(&method(:sanitize_message)).first
      parsed.retweets  = retweets
      parsed.faves     = faves
      parsed.replies   = replies
      parsed.id        = tweet.attr('data-tweet-id')
      parsed.permalink = permalink_path && "#{PERMALINK_PREFIX}#{permalink_path}"
      parsed.author_id = tweet.xpath(AUTHORID_SELECTOR).map { |node| node.attr('data-user-id').to_i }.first
      parsed.timestamp = tweet.xpath(TIMESTAMP_SELECTOR).map { |node| Time.at(node.attr('data-time').to_i) }.first
      parsed.hashtags  = hashtags
      parsed.mentions  = mentions
      parsed.geo       = tweet.xpath(GEO_SELECTOR).map { |node| node.attr('title') }.first.to_s
      parsed.links     = links.map { |node| node.attr('data-expanded-url') }.compact

      tweets + [parsed]
    end

    # Normalize spacing and remove errant spaces following pound signs, at
    # signs, and dollar signs
    #
    # @param tweet [#text] content node
    # @return [String]
    def self.sanitize_message(tweet)
      tweet.text
           .gsub(/\s+/, ' ')
           .gsub(/([#@\$]) /, '\1')
    end

    # Classify interactions (retweets, faves, and replies to a given tweet)
    #
    # @param tweet [#xpath] a single tweet node
    # @return [Array(Integer, Integer, Integer)] retweets, favorites, replies;
    #   a missing counter is reported as 0
    def self.tweet_interactions(tweet)
      [RETWEETS_SELECTOR, FAVORITES_SELECTOR, REPLIES_SELECTOR].map do |selector|
        tweet.xpath(selector)
             .map { |node| node.attr('data-tweet-stat-count') }
             .first
             .to_i
      end
    end

    # Classify links belonging to hashtags and (outgoing) mentions within a
    # tweet.  Links whose href is missing or not site-relative are skipped;
    # they must not stop the scan, or every later hashtag/mention is lost.
    #
    # @param links [Enumerable<#attr>] anchor nodes of a tweet
    # @return [Array(Array<String>, Array<String>)] hashtags ("#tag") and
    #   mentions ("@user"), each in document order
    def self.tweet_hashtags_and_mentions(links)
      links.each_with_object([[], []]) do |link, (hashtags, mentions)|
        href = link.attr('href').to_s
        next unless href.start_with?('/')

        if link.attr('data-mentioned-user-id')
          mentions << "@#{href[1..-1]}"
        elsif href.start_with?('/hashtag/')
          # Drop the path prefix and any query string (e.g. "?src=hash")
          hashtags << "##{href.delete_prefix('/hashtag/').split('?', 2).first}"
        end
      end
    end

    # Build the value of the search "q" parameter from the criteria.
    #
    # @param criteria [GoldTweets::Search]
    # @return [String] space-separated search terms (may be empty)
    def self.build_query(criteria)
      usernames = Array(criteria.username)
      terms     = []

      terms << criteria.query if criteria.query
      terms.concat(Array(criteria.exclude_words).map { |word| "-#{word}" })
      terms << usernames.map { |user| "from:#{user}" }.join(' OR ') unless usernames.empty?
      terms << "since:#{criteria.since}" if criteria.since
      terms << "until:#{criteria.upto}" if criteria.upto
      terms << "min_replies:#{criteria.minimum_replies}" if criteria.minimum_replies
      terms << "min_faves:#{criteria.minimum_faves}" if criteria.minimum_faves
      terms << "min_retweets:#{criteria.minimum_retweets}" if criteria.minimum_retweets

      # Location filters only make sense together with a distance
      if criteria.maximum_distance
        if criteria.near
          terms << "near:#{criteria.near} within:#{criteria.maximum_distance}"
        elsif criteria.lat && criteria.lon
          terms << "geocode:#{criteria.lat},#{criteria.lon},#{criteria.maximum_distance}"
        end
      end

      terms.join(' ').strip
    end

    # Reduce the Set-Cookie headers of a response to a value usable in a
    # Cookie request header ("name=value; name=value"), dropping attributes
    # such as Path or Expires.
    #
    # @param response [#get_fields] HTTP response
    # @return [String, nil] nil when the response set no cookies
    def self.cookies_from(response)
      pairs = Array(response.get_fields('set-cookie'))
              .map { |field| field.split(';', 2).first.to_s.strip }
              .reject(&:empty?)
      pairs.empty? ? nil : pairs.join('; ')
    end

    # Perform a search for tweets based on criteria specified
    #
    # @param criteria [GoldTweets::Search] search parameters for this batch
    # @param refresh_cursor [String] paging cursor from the previous response
    # @param cookie_jar [String] Cookie header value to send ('' for none)
    # @param user_agent [String] User-Agent header value
    # @return [Response] parsed HTML body, next cursor, cookies to carry
    #   forward (nil if none were set), and whether more items are available
    # @raise [JSON::ParserError] if the response body is not valid JSON
    def self.fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
      search = DEFAULT_PARAMETERS.dup
      search['f'] = 'tweets' unless criteria.top_tweets?
      search['l'] = criteria.language unless criteria.language.to_s.empty?
      search['q'] = build_query(criteria)
      search['max_position'] = refresh_cursor

      url = SEARCH_PREFIX + URI.encode_www_form(search)
      uri = URI(url)

      Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
        request = Net::HTTP::Get.new(uri)
        DEFAULT_HEADERS.each { |name, value| request[name] = value }
        request['User-Agent'] = user_agent
        request['Referer']    = url
        # Cookies go back to the server in "Cookie"; "Set-Cookie" is a
        # response-only header.
        request['Cookie']     = cookie_jar unless cookie_jar.to_s.empty?

        response = http.request(request)
        json     = JSON.parse(response.body)

        Response.new(Nokogiri::HTML(json['items_html']),
                     json['min_position'],
                     cookies_from(response),
                     json['has_more_items'])
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
module GoldTweets
  # Search criteria consumed by GoldTweets::Client.  Built from keyword
  # arguments; every member also has a reader and writer.
  Search = Struct.new(:username,
                      :exclude_words,
                      :since,
                      :upto,
                      :minimum_replies,
                      :minimum_faves,
                      :minimum_retweets,
                      :maximum_distance,
                      :near,
                      :lat,
                      :lon,
                      :query,
                      :maximum_tweets,
                      :language,
                      :emoji,
                      :top_tweets,
                      keyword_init: true) do
    # Set default values, otherwise no additional work done here.
    #
    # Unknown keywords raise ArgumentError.  Defaults that differ from nil:
    # exclude_words [], maximum_distance '15mi', maximum_tweets 0,
    # language '', emoji :ignore, top_tweets false.
    def initialize(username: nil,
                   exclude_words: [],
                   since: nil,
                   upto: nil,
                   minimum_replies: nil,
                   minimum_faves: nil,
                   minimum_retweets: nil,
                   maximum_distance: '15mi',
                   near: nil,
                   lat: nil,
                   lon: nil,
                   query: nil,
                   maximum_tweets: 0,
                   language: '',
                   emoji: :ignore,
                   top_tweets: false)
      # Bare `super` forwards every keyword above (with its default applied)
      # to the Struct initializer, so no per-field assignment is needed.
      super
    end

    # Plural and predicate spellings of the accessors
    alias_method :usernames=, :username=
    alias_method :usernames, :username
    alias_method :top_tweets?, :top_tweets
  end
end
@@ -0,0 +1,4 @@
1
module GoldTweets
  # Plain value container for the interesting data of a single tweet as
  # returned by the API.  Members are positional, in the order listed.
  Tweet = Struct.new(
    :username,
    :text,
    :retweets,
    :faves,
    :replies,
    :hashtags,
    :mentions,
    :to,
    :id,
    :permalink,
    :author_id,
    :timestamp,
    :geo,
    :links
  )
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: goldtweets
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tina Wuest
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-07-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5'
55
+ description: Ruby port of the Python GetOldTweets3 Twitter library
56
+ email: tina@wuest.me
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - lib/goldtweets.rb
62
+ - lib/goldtweets/client.rb
63
+ - lib/goldtweets/search.rb
64
+ - lib/goldtweets/tweet.rb
65
+ homepage: https://gitlab.com/wuest/goldtweets
66
+ licenses:
67
+ - MIT
68
+ metadata: {}
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.5.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.1.2
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Search Twitter including older tweets
88
+ test_files: []