twittercrawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4f234ed55d4b0d82dce0b38b43d1a584596f2d45
4
+ data.tar.gz: 2493c492eb0e9761073ef546a4039b39a8b2f883
5
+ SHA512:
6
+ metadata.gz: 023df126446294405e8ceafeaa9146d61561e05652adee497c7b78034e5146e5f0756f66c82d6273654e9bd7cf86e8ca8ee4dcda22076f8b682d3cd59fb73cdc
7
+ data.tar.gz: 3aea854fc09e6dfb5dcb27f449f98d1d9302eb1ca88cdbcf39d1ae30e71070b4773402d7c6f44e1167175d4bb9f879bb456d6e1da01eb2a54f085a963e9141ed
@@ -0,0 +1,65 @@
1
+ require 'requestmanager'
2
+ require 'selenium-webdriver'
3
+ require 'pry'
4
+ require 'nokogiri'
5
+
6
+ load 'twitter_parser.rb'
7
+
8
# Crawls Twitter's search results for a term by driving a browser through the
# injected request manager, then parses every tweet found on the page.
#
# The request manager must respond to get_page, get_most_recent_browser and
# close_all_browsers (as used below).
class TwitterCrawler
  # search_term - text to search for
  # operator    - optional advanced-search operator appended to the term
  #               (nil/false for none)
  # requests    - request manager that owns the browser(s)
  def initialize(search_term, operator, requests)
    @search_term = search_term
    @operator = operator
    @requests = requests
    @output = []
  end

  # Generate advanced query: the search term (plus the operator, when given),
  # percent-encoded for use as the value of the q= URL parameter.
  def gen_query
    terms = @operator ? "#{@search_term} #{@operator}" : @search_term

    # URI.encode was removed in Ruby 3.0 and never escaped "&", "+" or "=",
    # which corrupted the query string. encode_www_form_component escapes
    # every reserved character; it writes spaces as "+", so those are turned
    # back into "%20" (any literal "+" has already become "%2B").
    URI.encode_www_form_component(terms).gsub("+", "%20")
  end

  # Load the search page, scroll until no more tweets load, and parse them.
  # Results are accumulated internally; read them with gen_json.
  def crawl
    @requests.get_page("https://twitter.com/search?f=tweets&q=" + gen_query)
    scroll_down(0)
    get_tweets
  ensure
    # Close the browsers even when loading, scrolling or parsing raised,
    # so a failed crawl does not leak browser processes.
    @requests.close_all_browsers
  end

  # Get the tweets on the page and append each parsed tweet to the output
  def get_tweets
    current_browser.find_elements(class: "tweet").each do |tweet|
      parsed = TwitterParser.new(tweet.attribute("innerHTML")).parse_tweet

      # parse_tweet returns nil for tweets without text; skip those rather
      # than emitting null entries in the JSON output.
      @output.push(parsed) if parsed
    end
  end

  # Scroll down to the bottom.
  #
  # Keeps scrolling while the number of tweets on the page is greater than
  # the count seen on the previous pass. last_tweet_num is the count from the
  # previous pass (0 on the first call).
  def scroll_down(last_tweet_num)
    loop do
      browser = current_browser
      tweets = browser.find_elements(class: "tweet")

      # Nothing to scroll to (e.g. a search with no results)
      return if tweets.empty?

      # Scroll to the second-to-last tweet, or the only tweet when there is
      # just one (same element the original index arithmetic selected).
      # NOTE(review): presumably this triggers loading of more results - confirm.
      target = tweets[-2] || tweets.last
      target.location_once_scrolled_into_view

      # Check if it should be rerun
      sleep(1)
      tweet_count = browser.find_elements(class: "tweet").length
      return unless tweet_count > last_tweet_num

      last_tweet_num = tweet_count
    end
  end

  # Generate JSON for output
  def gen_json
    JSON.pretty_generate(@output)
  end

  private

  # The browser the request manager used most recently
  def current_browser
    @requests.get_most_recent_browser[1].first
  end
end
65
+
@@ -0,0 +1,98 @@
1
+ require 'nokogiri'
2
+ require 'pry'
3
+
4
require 'date' # DateTime is used by get_tweet_time

# Extracts structured fields from the HTML of a single tweet.
#
# Lookups that find no matching element return nil instead of raising, so one
# tweet with unexpected markup does not abort a whole crawl.
class TwitterParser
  # tweet - HTML string for one tweet
  def initialize(tweet)
    @tweet = Nokogiri::HTML.parse(tweet)
  end

  # Parse the individual tweet.
  #
  # Returns a hash of the tweet's fields, or nil when the tweet has no text.
  def parse_tweet
    return nil if @tweet.text.empty?

    # Each of these returns a pair; look them up once and destructure rather
    # than querying the document twice per pair.
    reply_to_user, reply_to_uid = get_reply_to_user
    mention_names, mention_uids = get_mentions

    {
      tweet_text: get_tweet_text,
      username: get_username,
      fullname: get_fullname,
      user_id: get_user_id,
      reply_to_user: reply_to_user,
      reply_to_uid: reply_to_uid,
      tweet_time: get_tweet_time,
      tweet_link: get_tweet_link,
      retweet_count: get_retweet_count,
      favorite_count: get_favorite_count,
      reply_count: get_reply_count,
      mention_names: mention_names,
      mention_uids: mention_uids
    }
  end

  # Get the username
  def get_username
    @tweet.css(".username").text
  end

  # Get the fullname
  def get_fullname
    @tweet.css(".fullname").text
  end

  # Get user ID number (nil when the profile link is missing)
  def get_user_id
    attribute_of(@tweet.css(".js-user-profile-link").css(".account-group").first, "data-user-id")
  end

  # Get the tweet text, without surrounding whitespace
  def get_tweet_text
    @tweet.css(".js-tweet-text-container").text.strip
  end

  # Get the time for the tweet, formatted like "03 Feb 2017 09:02:00"
  # (nil when the timestamp element or its title is missing)
  def get_tweet_time
    title = attribute_of(timestamp_node, "title")
    return nil if title.nil? || title.empty?

    DateTime.parse(title).strftime('%d %b %Y %H:%M:%S')
  end

  # Get the link to the tweet (nil when the timestamp link is missing)
  def get_tweet_link
    href = attribute_of(timestamp_node, 'href')
    href && "https://twitter.com" + href
  end

  # Get the # of retweets
  def get_retweet_count
    stat_count(".ProfileTweet-action--retweet")
  end

  # Get the # of favorites
  def get_favorite_count
    stat_count(".ProfileTweet-action--favorite")
  end

  # Get the # of replies
  def get_reply_count
    stat_count(".ProfileTweet-action--reply")
  end

  # Get the user tweet is replying to (if any).
  #
  # Returns [reply_to_user, reply_to_uid], or [nil, nil] when the tweet is
  # not a reply.
  def get_reply_to_user
    reply_span = @tweet.css("span").find { |span| span.text.include?("In reply") }
    link = reply_span && reply_span.css("a").first
    return nil, nil unless link

    href = link['href']
    # The href is a site-relative path such as "/name"; turn it into "@name"
    return href && href.gsub("/", "@"), link['data-user-id']
  end

  # Get the mentioned accounts (if any).
  #
  # Returns [mention_names, mention_uids] as parallel arrays, or [nil, nil]
  # when nothing is mentioned.
  def get_mentions
    mentions = @tweet.css(".twitter-atreply")
    return nil, nil if mentions.empty?

    return mentions.map { |mention| mention.text },
           mentions.map { |mention| mention['data-mentioned-user-id'] }
  end

  private

  # Read an attribute from a node that may be nil
  def attribute_of(node, name)
    node && node[name]
  end

  # First element carrying the tweet's timestamp (title and permalink)
  def timestamp_node
    @tweet.css(".tweet-timestamp").first
  end

  # Read data-tweet-stat-count from the first span of the given action element
  def stat_count(action_selector)
    action = @tweet.css(action_selector).first
    action && attribute_of(action.css("span").first, 'data-tweet-stat-count')
  end
end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twittercrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-02-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Crawls Twitter
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/twitter_crawler.rb
20
+ - lib/twitter_parser.rb
21
+ homepage: https://github.com/TransparencyToolkit/TwitterCrawler
22
+ licenses:
23
+ - GPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.8
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: Crawls Twitter
45
+ test_files: []
46
+ has_rdoc: