twittercrawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/twitter_crawler.rb +65 -0
- data/lib/twitter_parser.rb +98 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4f234ed55d4b0d82dce0b38b43d1a584596f2d45
|
4
|
+
data.tar.gz: 2493c492eb0e9761073ef546a4039b39a8b2f883
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 023df126446294405e8ceafeaa9146d61561e05652adee497c7b78034e5146e5f0756f66c82d6273654e9bd7cf86e8ca8ee4dcda22076f8b682d3cd59fb73cdc
|
7
|
+
data.tar.gz: 3aea854fc09e6dfb5dcb27f449f98d1d9302eb1ca88cdbcf39d1ae30e71070b4773402d7c6f44e1167175d4bb9f879bb456d6e1da01eb2a54f085a963e9141ed
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'requestmanager'
|
2
|
+
require 'selenium-webdriver'
|
3
|
+
require 'pry'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
load 'twitter_parser.rb'
|
7
|
+
|
8
|
+
require 'json'
require 'uri'

# Drives a browser session to a Twitter search page, scrolls until no new
# tweets appear, and collects the parsed tweets.
#
# The +requests+ collaborator is only used through the calls visible in this
# class: +get_page(url)+, +get_most_recent_browser+ (whose element at index 1
# is a collection of browsers) and +close_all_browsers+.
class TwitterCrawler
  # Base of the search URL; the encoded query is appended to it.
  SEARCH_URL = 'https://twitter.com/search?f=tweets&q='

  # CSS class used to locate tweet elements on the page.
  TWEET_CLASS = 'tweet'

  # Seconds to wait after scrolling before re-counting the tweets.
  SCROLL_PAUSE = 1

  # @param search_term [String] text to search for
  # @param operator [String, nil] optional advanced-search operator that is
  #   appended to the search term
  # @param requests [Object] request manager that owns the browser(s)
  def initialize(search_term, operator, requests)
    @search_term = search_term
    @operator = operator
    @requests = requests
    @output = []
  end

  # Generate advanced query.
  #
  # Uses URI.encode_www_form_component instead of URI.encode: the latter was
  # removed in Ruby 3.0 and left reserved characters such as "&" and "="
  # unescaped, so a term containing them altered the query string.
  #
  # @return [String] the search term (plus operator, if any) escaped for use
  #   as a query-string value
  def gen_query
    terms = @operator ? "#{@search_term} #{@operator}" : @search_term
    URI.encode_www_form_component(terms)
  end

  # Load the search page, scroll to the end of the results and parse every
  # tweet into the output list.
  #
  # The browsers are closed even when one of the steps raises.
  #
  # @return [Object] whatever +close_all_browsers+ returns
  def crawl
    begin
      @requests.get_page(SEARCH_URL + gen_query)
      scroll_down(0)
      get_tweets
    ensure
      # Assigned inside ensure so the method keeps returning the close result
      # on the success path while still cleaning up on failure.
      closed = @requests.close_all_browsers
    end
    closed
  end

  # Get the tweets on the page and append their parsed form to the output.
  #
  # Tweets for which the parser returns nil are skipped so the output (and
  # the generated JSON) contains no null entries.
  def get_tweets
    current_browser.find_elements(class: TWEET_CLASS).each do |tweet|
      parsed = TwitterParser.new(tweet.attribute('innerHTML')).parse_tweet
      @output.push(parsed) if parsed
    end
  end

  # Scroll down until the number of tweets on the page stops growing.
  #
  # Implemented as a loop rather than recursion so a long result list cannot
  # exhaust the stack.
  #
  # @param last_tweet_num [Integer] tweet count seen before this call
  # @return [nil]
  def scroll_down(last_tweet_num)
    seen = last_tweet_num
    loop do
      browser = current_browser
      tweets = browser.find_elements(class: TWEET_CLASS)
      # Nothing to scroll to; indexing an empty list would yield nil.
      break if tweets.empty?

      # Targets the second-to-last tweet; with a single tweet the negative
      # index wraps around to that tweet. Presumably this triggers loading of
      # further results - confirm against the page behavior.
      tweets[tweets.length - 2].location_once_scrolled_into_view

      # Check if it should be rerun
      sleep(SCROLL_PAUSE)
      tweet_count = browser.find_elements(class: TWEET_CLASS).length
      break unless tweet_count > seen

      seen = tweet_count
    end
  end

  # Generate JSON for output.
  #
  # @return [String] pretty-printed JSON array of the collected tweets
  def gen_json
    JSON.pretty_generate(@output)
  end

  private

  # First browser of the most recent browser entry held by the request manager.
  def current_browser
    @requests.get_most_recent_browser[1].first
  end
end
|
65
|
+
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'pry'
|
3
|
+
|
4
|
+
require 'date'

# Extracts the fields of a single tweet from its HTML fragment.
#
# Every getter returns nil (or an empty string for plain text lookups) when
# the element it looks for is not present in the fragment, instead of raising
# NoMethodError on a missing node.
class TwitterParser
  # Output format of the tweet timestamp.
  TIME_FORMAT = '%d %b %Y %H:%M:%S'

  # Prefix for the relative tweet link found in the markup.
  BASE_URL = 'https://twitter.com'

  # @param tweet [String] HTML of one tweet element
  def initialize(tweet)
    @tweet = Nokogiri::HTML.parse(tweet)
  end

  # Parse the individual tweet.
  #
  # @return [Hash, nil] the extracted fields, or nil when the fragment has no
  #   text at all
  def parse_tweet
    return if @tweet.text.empty?

    # Each of these walks the document, so run them once and destructure.
    reply_to_user, reply_to_uid = get_reply_to_user
    mention_names, mention_uids = get_mentions

    {
      tweet_text: get_tweet_text,
      username: get_username,
      fullname: get_fullname,
      user_id: get_user_id,
      reply_to_user: reply_to_user,
      reply_to_uid: reply_to_uid,
      tweet_time: get_tweet_time,
      tweet_link: get_tweet_link,
      retweet_count: get_retweet_count,
      favorite_count: get_favorite_count,
      reply_count: get_reply_count,
      mention_names: mention_names,
      mention_uids: mention_uids
    }
  end

  # Get the username
  def get_username
    @tweet.css('.username').text
  end

  # Get the fullname
  def get_fullname
    @tweet.css('.fullname').text
  end

  # Get user ID number
  #
  # @return [String, nil] the "data-user-id" attribute of the first matching
  #   element
  def get_user_id
    account = @tweet.css('.js-user-profile-link').css('.account-group').first
    attribute_of(account, 'data-user-id')
  end

  # Get the tweet text with surrounding whitespace removed
  def get_tweet_text
    @tweet.css('.js-tweet-text-container').text.strip
  end

  # Get the time for the tweet
  #
  # @return [String, nil] the timestamp's "title" attribute reformatted with
  #   TIME_FORMAT, or nil when there is no timestamp title
  # @raise [ArgumentError] from DateTime.parse when the title is not a
  #   parseable date
  def get_tweet_time
    title = attribute_of(timestamp_node, 'title')
    title && DateTime.parse(title).strftime(TIME_FORMAT)
  end

  # Get the link to the tweet
  #
  # @return [String, nil] absolute URL built from the timestamp's "href"
  def get_tweet_link
    href = attribute_of(timestamp_node, 'href')
    href && BASE_URL + href
  end

  # Get the # of retweets
  def get_retweet_count
    stat_count('retweet')
  end

  # Get the # of favorites
  def get_favorite_count
    stat_count('favorite')
  end

  # Get the # of replies
  def get_reply_count
    stat_count('reply')
  end

  # Get the user tweet is replying to (if any)
  #
  # @return [Array(String, String), Array(nil, nil)] the handle (the link's
  #   href with "/" replaced by "@") and the user id
  def get_reply_to_user
    marker = @tweet.css('span').find { |span| span.text.include?('In reply') }
    link = marker && marker.css('a').first
    return nil, nil unless link

    href = link['href']
    [href && href.tr('/', '@'), link['data-user-id']]
  end

  # Get the mentioned accounts (if any)
  #
  # @return [Array(Array<String>, Array<String>), Array(nil, nil)] mention
  #   texts and their "data-mentioned-user-id" values
  def get_mentions
    mentions = @tweet.css('.twitter-atreply')
    return nil, nil if mentions.empty?

    [mentions.map { |m| m.text }, mentions.map { |m| m['data-mentioned-user-id'] }]
  end

  private

  # Read an attribute from a node that may be absent.
  def attribute_of(node, name)
    node && node[name]
  end

  # First element carrying the tweet timestamp, or nil.
  def timestamp_node
    @tweet.css('.tweet-timestamp').first
  end

  # Read "data-tweet-stat-count" from the first span inside the given
  # ProfileTweet action container.
  def stat_count(action)
    container = @tweet.css(".ProfileTweet-action--#{action}").first
    counter = container && container.css('span').first
    attribute_of(counter, 'data-tweet-stat-count')
  end
end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twittercrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-02-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Crawls Twitter
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/twitter_crawler.rb
|
20
|
+
- lib/twitter_parser.rb
|
21
|
+
homepage: https://github.com/TransparencyToolkit/TwitterCrawler
|
22
|
+
licenses:
|
23
|
+
- GPL
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.4.8
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: Crawls Twitter
|
45
|
+
test_files: []
|
46
|
+
has_rdoc:
|