twittercrawler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/twitter_crawler.rb +65 -0
- data/lib/twitter_parser.rb +98 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4f234ed55d4b0d82dce0b38b43d1a584596f2d45
|
4
|
+
data.tar.gz: 2493c492eb0e9761073ef546a4039b39a8b2f883
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 023df126446294405e8ceafeaa9146d61561e05652adee497c7b78034e5146e5f0756f66c82d6273654e9bd7cf86e8ca8ee4dcda22076f8b682d3cd59fb73cdc
|
7
|
+
data.tar.gz: 3aea854fc09e6dfb5dcb27f449f98d1d9302eb1ca88cdbcf39d1ae30e71070b4773402d7c6f44e1167175d4bb9f879bb456d6e1da01eb2a54f085a963e9141ed
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'requestmanager'
|
2
|
+
require 'selenium-webdriver'
|
3
|
+
require 'pry'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
load 'twitter_parser.rb'
|
7
|
+
|
8
|
+
require 'json'
require 'uri'

# Runs a Twitter search in a browser, scrolls until no further tweets load,
# and collects the parsed tweets so they can be emitted as JSON.
class TwitterCrawler
  # Search endpoint; the encoded query string is appended to it.
  SEARCH_URL = "https://twitter.com/search?f=tweets&q="

  # Seconds to wait after scrolling before re-counting the tweets on the page.
  SCROLL_PAUSE = 1

  # @param search_term [String] the text to search for
  # @param operator [String, nil] extra text appended to the search term
  #   (separated by a space); skipped when nil/false
  # @param requests [#get_page, #get_most_recent_browser, #close_all_browsers]
  #   object that drives the browser(s)
  def initialize(search_term, operator, requests)
    @search_term = search_term
    @operator = operator
    @requests = requests
    @output = []
  end

  # Generate advanced query
  #
  # URI.encode no longer exists in Ruby 3.0+ and never escaped characters such
  # as "&" or "=", which corrupt a query-string value. The form-component
  # encoder escapes them; its "+" (space) is rewritten to "%20" so spaces are
  # encoded the same way as before. A literal "+" is already "%2B" at that
  # point, so the substitution cannot touch it.
  #
  # @return [String] the percent-encoded value for the "q" parameter
  def gen_query
    terms = @operator ? "#{@search_term} #{@operator}" : @search_term
    URI.encode_www_form_component(terms).gsub("+", "%20")
  end

  # Load the search page, scroll through the results and collect the tweets.
  # The browsers are closed even when one of the steps raises.
  def crawl
    @requests.get_page(SEARCH_URL + gen_query)
    scroll_down(0)
    get_tweets
  ensure
    @requests.close_all_browsers
  end

  # Get the tweets on the page
  #
  # Each element with class "tweet" has its innerHTML handed to TwitterParser.
  # Tweets for which the parser returns nil are skipped so the output contains
  # no null entries.
  def get_tweets
    current_browser.find_elements(class: "tweet").each do |tweet|
      parsed = TwitterParser.new(tweet.attribute("innerHTML")).parse_tweet
      @output.push(parsed) if parsed
    end
  end

  # Scroll down to the bottom
  #
  # Repeatedly scrolls the second-to-last tweet (or the only tweet) into view,
  # waits, and stops once the number of tweets on the page no longer exceeds
  # the previous count. Implemented as a loop so long result lists cannot
  # exhaust the stack, and returns immediately when the page has no tweets.
  #
  # @param last_tweet_num [Integer] tweet count seen before this call
  # @return [nil]
  def scroll_down(last_tweet_num)
    loop do
      browser = current_browser
      tweets = browser.find_elements(class: "tweet")
      return if tweets.empty?

      # Scroll down to last tweet
      (tweets[-2] || tweets.last).location_once_scrolled_into_view

      # Check if it should be rerun
      sleep(SCROLL_PAUSE)
      tweet_count = browser.find_elements(class: "tweet").length
      return unless tweet_count > last_tweet_num

      last_tweet_num = tweet_count
    end
  end

  # Generate JSON for output
  #
  # @return [String] pretty-printed JSON array of the collected tweets
  def gen_json
    JSON.pretty_generate(@output)
  end

  private

  # The browser taken from the request manager's most-recent-browser entry.
  def current_browser
    @requests.get_most_recent_browser[1].first
  end
end
|
65
|
+
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'pry'
|
3
|
+
|
4
|
+
require 'date'

# Extracts the individual fields of a single tweet from its HTML markup.
# Lookups that depend on an element or attribute which is absent from the
# markup return nil instead of raising.
class TwitterParser
  # Prefix for the relative link found on the tweet timestamp.
  TWITTER_URL = "https://twitter.com"

  # Output format for the tweet time.
  TIME_FORMAT = '%d %b %Y %H:%M:%S'

  # @param tweet [String] HTML of one tweet
  def initialize(tweet)
    @tweet = Nokogiri::HTML.parse(tweet)
  end

  # Parse the individual tweet
  #
  # @return [Hash, nil] the tweet's fields keyed by symbol, or nil when the
  #   parsed document contains no text
  def parse_tweet
    return if @tweet.text.empty?

    # Look these up once; each yields a pair used for two hash entries.
    reply_to_user, reply_to_uid = get_reply_to_user
    mention_names, mention_uids = get_mentions

    {
      tweet_text: get_tweet_text,
      username: get_username,
      fullname: get_fullname,
      user_id: get_user_id,
      reply_to_user: reply_to_user,
      reply_to_uid: reply_to_uid,
      tweet_time: get_tweet_time,
      tweet_link: get_tweet_link,
      retweet_count: get_retweet_count,
      favorite_count: get_favorite_count,
      reply_count: get_reply_count,
      mention_names: mention_names,
      mention_uids: mention_uids
    }
  end

  # Get the username
  def get_username
    @tweet.css(".username").text
  end

  # Get the fullname
  def get_fullname
    @tweet.css(".fullname").text
  end

  # Get user ID number
  #
  # @return [String, nil] the "data-user-id" attribute, or nil when missing
  def get_user_id
    profile_link = @tweet.css(".js-user-profile-link").css(".account-group").first
    attribute_of(profile_link, "data-user-id")
  end

  # Get the tweet text
  def get_tweet_text
    @tweet.css(".js-tweet-text-container").text.strip
  end

  # Get the time for the tweet
  #
  # @return [String, nil] the timestamp's "title" attribute reformatted with
  #   TIME_FORMAT, or nil when there is no timestamp title
  # @raise [ArgumentError] when the title cannot be parsed as a date
  def get_tweet_time
    title = attribute_of(timestamp_node, "title")
    DateTime.parse(title).strftime(TIME_FORMAT) if title
  end

  # Get the link to the tweet
  #
  # @return [String, nil] absolute URL, or nil when the timestamp has no href
  def get_tweet_link
    href = attribute_of(timestamp_node, "href")
    TWITTER_URL + href if href
  end

  # Get the # of retweets
  def get_retweet_count
    action_count("retweet")
  end

  # Get the # of favorites
  def get_favorite_count
    action_count("favorite")
  end

  # Get the # of replies
  def get_reply_count
    action_count("reply")
  end

  # Get the user tweet is replying to (if any)
  #
  # @return [Array] two-element array [name, user id]; name is the link's
  #   href with "/" replaced by "@". Both are nil when the tweet has no
  #   "In reply" span or that span contains no link.
  def get_reply_to_user
    reply_to = @tweet.css("span").find { |span| span.text.include?("In reply") }
    link = reply_to && reply_to.css("a").first
    return [nil, nil] unless link

    href = link['href']
    [href && href.tr("/", "@"), link['data-user-id']]
  end

  # Get the mentioned accounts (if any)
  #
  # @return [Array] two-element array [names, user ids], each an Array with
  #   one entry per mention; [nil, nil] when there are no mentions
  def get_mentions
    mentions = @tweet.css(".twitter-atreply")
    return [nil, nil] if mentions.empty?

    [mentions.map { |m| m.text }, mentions.map { |m| m['data-mentioned-user-id'] }]
  end

  private

  # Read an attribute from a node that may be nil.
  def attribute_of(node, name)
    node && node[name]
  end

  # First element carrying the tweet timestamp, or nil.
  def timestamp_node
    @tweet.css(".tweet-timestamp").first
  end

  # Stat count stored on the first span inside the given action's container.
  def action_count(action)
    container = @tweet.css(".ProfileTweet-action--#{action}").first
    attribute_of(container && container.css("span").first, "data-tweet-stat-count")
  end
end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twittercrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-02-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Crawls Twitter
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/twitter_crawler.rb
|
20
|
+
- lib/twitter_parser.rb
|
21
|
+
homepage: https://github.com/TransparencyToolkit/TwitterCrawler
|
22
|
+
licenses:
|
23
|
+
- GPL
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.4.8
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: Crawls Twitter
|
45
|
+
test_files: []
|
46
|
+
has_rdoc:
|