twittercrawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4f234ed55d4b0d82dce0b38b43d1a584596f2d45
4
+ data.tar.gz: 2493c492eb0e9761073ef546a4039b39a8b2f883
5
+ SHA512:
6
+ metadata.gz: 023df126446294405e8ceafeaa9146d61561e05652adee497c7b78034e5146e5f0756f66c82d6273654e9bd7cf86e8ca8ee4dcda22076f8b682d3cd59fb73cdc
7
+ data.tar.gz: 3aea854fc09e6dfb5dcb27f449f98d1d9302eb1ca88cdbcf39d1ae30e71070b4773402d7c6f44e1167175d4bb9f879bb456d6e1da01eb2a54f085a963e9141ed
@@ -0,0 +1,65 @@
1
+ require 'requestmanager'
2
+ require 'selenium-webdriver'
3
+ require 'pry'
4
+ require 'nokogiri'
5
+
6
+ load 'twitter_parser.rb'
7
+
8
# Runs a Twitter search through an injected request manager, scrolls the
# results page until no further tweets load, and collects the parsed tweets.
class TwitterCrawler
  # Search endpoint; the encoded query string is appended to it.
  SEARCH_URL = "https://twitter.com/search?f=tweets&q=".freeze

  # @param search_term [String] text to search for
  # @param operator [String, nil] extra search operator appended after the
  #   term, separated by a space; skipped when nil/false
  # @param requests [#get_page, #get_most_recent_browser, #close_all_browsers]
  #   object used to load pages and reach the browser
  def initialize(search_term, operator, requests)
    @search_term = search_term
    @operator = operator
    @requests = requests
    @output = []
  end

  # Generate advanced query
  #
  # Every character that is special inside a query string ("&", "+", "=",
  # "?", ...) is percent-encoded so it cannot split or alter the q= parameter.
  #
  # @return [String] the search term (plus operator, if any), percent-encoded
  def gen_query
    terms = @operator ? "#{@search_term} #{@operator}" : @search_term
    # encode_www_form_component writes spaces as "+"; literal plus signs are
    # already "%2B" at this point, so every remaining "+" is a space. Emit
    # "%20" to keep the space encoding the previous implementation produced.
    URI.encode_www_form_component(terms).gsub("+", "%20")
  end

  # Load the search page, scroll to the end of the results and parse them.
  # Browsers are closed even when one of the steps raises.
  #
  # @return whatever @requests.close_all_browsers returns
  def crawl
    begin
      @requests.get_page(SEARCH_URL + gen_query)
      scroll_down(0)
      get_tweets
    ensure
      closed = @requests.close_all_browsers
    end
    closed
  end

  # Get the tweets on the page
  #
  # Each element with class "tweet" is parsed by TwitterParser and appended
  # to the collected output; tweets the parser skips (nil) are not stored.
  def get_tweets
    browser = @requests.get_most_recent_browser[1].first
    tweets = browser.find_elements(class: "tweet")

    # Parse each tweet
    tweets.each do |tweet|
      parsed = TwitterParser.new(tweet.attribute("innerHTML")).parse_tweet
      @output.push(parsed) if parsed
    end
  end

  # Scroll down to the bottom
  #
  # Keeps scrolling while the number of tweets on the page is greater than
  # the count seen on the previous pass.
  #
  # @param last_tweet_num [Integer] tweet count already seen (0 to start)
  # @return [nil]
  def scroll_down(last_tweet_num)
    seen = last_tweet_num
    loop do
      browser = @requests.get_most_recent_browser[1].first
      tweets = browser.find_elements(class: "tweet")

      # Scroll to the second-to-last tweet, or to the only tweet when there
      # is just one. With no tweets at all there is nothing to scroll to.
      anchor = tweets[-2] || tweets.last
      break if anchor.nil?

      anchor.location_once_scrolled_into_view

      # Check if it should be rerun
      sleep(1)
      tweet_count = browser.find_elements(class: "tweet").length
      break unless tweet_count > seen

      seen = tweet_count
    end
  end

  # Generate JSON for output
  #
  # @return [String] pretty-printed JSON array of the collected tweets
  def gen_json
    JSON.pretty_generate(@output)
  end
end
65
+
@@ -0,0 +1,98 @@
1
+ require 'nokogiri'
2
+ require 'pry'
3
+
4
# Extracts the fields of a single tweet from its HTML. Lookups that find no
# matching element yield nil instead of raising, so one unusually structured
# tweet does not abort a whole crawl.
class TwitterParser
  # @param tweet [String] HTML of one tweet, handed to Nokogiri::HTML.parse
  def initialize(tweet)
    @tweet = Nokogiri::HTML.parse(tweet)
  end

  # Parse the individual tweet
  #
  # @return [Hash, nil] the tweet's fields keyed by symbol, or nil when the
  #   document contains no text
  def parse_tweet
    return nil if @tweet.text.empty?

    # Look these up once; each returns a two-element pair.
    reply_to_user, reply_to_uid = get_reply_to_user
    mention_names, mention_uids = get_mentions

    {
      tweet_text: get_tweet_text,
      username: get_username,
      fullname: get_fullname,
      user_id: get_user_id,
      reply_to_user: reply_to_user,
      reply_to_uid: reply_to_uid,
      tweet_time: get_tweet_time,
      tweet_link: get_tweet_link,
      retweet_count: get_retweet_count,
      favorite_count: get_favorite_count,
      reply_count: get_reply_count,
      mention_names: mention_names,
      mention_uids: mention_uids
    }
  end

  # Get the username
  def get_username
    @tweet.css(".username").text
  end

  # Get the fullname
  def get_fullname
    @tweet.css(".fullname").text
  end

  # Get user ID number (nil when the profile link is missing)
  def get_user_id
    profile_link = @tweet.css(".js-user-profile-link").css(".account-group")[0]
    attribute_of(profile_link, "data-user-id")
  end

  # Get the tweet text, with surrounding whitespace removed
  def get_tweet_text
    @tweet.css(".js-tweet-text-container").text.strip
  end

  # Get the time for the tweet, formatted like "03 Feb 2017 16:05:00"
  # (nil when the timestamp element or its title is missing)
  def get_tweet_time
    title = attribute_of(@tweet.css(".tweet-timestamp")[0], "title")
    return nil if title.nil?

    DateTime.parse(title).strftime('%d %b %Y %H:%M:%S')
  end

  # Get the link to the tweet (nil when the timestamp element has no href)
  def get_tweet_link
    href = attribute_of(@tweet.css(".tweet-timestamp")[0], 'href')
    href && "https://twitter.com" + href
  end

  # Get the # of retweets
  def get_retweet_count
    stat_count(".ProfileTweet-action--retweet")
  end

  # Get the # of favorites
  def get_favorite_count
    stat_count(".ProfileTweet-action--favorite")
  end

  # Get the # of replies
  def get_reply_count
    stat_count(".ProfileTweet-action--reply")
  end

  # Get the user tweet is replying to (if any)
  #
  # @return [Array] [username prefixed with "@", user id], or [nil, nil] when
  #   no span containing "In reply" with a link is present
  def get_reply_to_user
    reply_to = @tweet.css("span").find { |span| span.text.include?("In reply") }
    link = reply_to && reply_to.css("a")[0]
    return [nil, nil] unless link

    href = link['href']
    # The href is a path such as "/name"; swapping "/" for "@" gives "@name".
    [href && href.tr("/", "@"), link['data-user-id']]
  end

  # Get the mentioned accounts (if any)
  #
  # @return [Array] [names, user ids] as two parallel arrays, or [nil, nil]
  #   when the tweet mentions nobody
  def get_mentions
    mentions = @tweet.css(".twitter-atreply")
    return [nil, nil] if mentions.empty?

    [mentions.map(&:text), mentions.map { |mention| mention['data-mentioned-user-id'] }]
  end

  private

  # Read attribute +name+ from +node+, tolerating a node that was not found.
  def attribute_of(node, name)
    node && node[name]
  end

  # Read data-tweet-stat-count from the first span under the first element
  # matching +action_selector+; nil when either element is missing.
  def stat_count(action_selector)
    action = @tweet.css(action_selector)[0]
    counter = action && action.css("span")[0]
    attribute_of(counter, 'data-tweet-stat-count')
  end
end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twittercrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-02-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Crawls Twitter
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/twitter_crawler.rb
20
+ - lib/twitter_parser.rb
21
+ homepage: https://github.com/TransparencyToolkit/TwitterCrawler
22
+ licenses:
23
+ - GPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.8
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: Crawls Twitter
45
+ test_files: []
46
+ has_rdoc: