twittercrawler 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1230a596110d9376fa6243651b2994f2d0228afb
4
- data.tar.gz: f93449ae2abbc85227da02665f31ea6848712615
3
+ metadata.gz: 8b2a8f686154fcb64246ed8c3bef02f324df3892
4
+ data.tar.gz: f8219382c56cfacad13e7fbd55a25e156280976f
5
5
  SHA512:
6
- metadata.gz: 2dd0c8b7e41565e4f3c71bfbbabfbe22b8f9cbec6a3bde803270ec62e11c591145e1e0bd4263a392627341f6ac96b8c51c89d39da9a90f9bf36465e41d69a66f
7
- data.tar.gz: bbdb93efc8b9450b6da96f31c90d1446781ced81d5a2adf6e7540cb09b983d9bd5a1be12f844bca71dd36894e1836c8bfc05a810df4d088f6062ac19cc4ede49
6
+ metadata.gz: f21e73e1bd854174182fee749153be4499c2b2f6742ae6ca09aa1e85e2d100e9eab9e2af3898400aa0f0bb6d9b071292cf4393d80b45cfaf37d0e22780a0834b
7
+ data.tar.gz: 7a8c376ffcaaeb0ae55964251dd7a0efaf5bece5fd81a03c99eed231784bc29d1f5c02855b31ee06931faa97e3da988146210afecfbb6a0304497fd0684f12f1
@@ -22,7 +22,8 @@ class TwitterParser
22
22
  favorite_count: get_favorite_count,
23
23
  reply_count: get_reply_count,
24
24
  mention_names: get_mentions[0],
25
- mention_uids: get_mentions[1]
25
+ mention_uids: get_mentions[1],
26
+ time_collected: Time.now
26
27
  }
27
28
  end
28
29
  end
@@ -2,14 +2,14 @@ require 'requestmanager'
2
2
  require 'selenium-webdriver'
3
3
  require 'pry'
4
4
  require 'nokogiri'
5
+ require 'curb'
5
6
 
6
7
  load 'twitter_parser.rb'
7
8
 
8
9
  class TwitterCrawler
9
- def initialize(search_term, operator, requests, cm_hash)
10
+ def initialize(search_term, operator, cm_hash)
10
11
  @search_term = search_term
11
12
  @operator = operator
12
- @requests = requests
13
13
  @output = Array.new
14
14
 
15
15
  # Handle crawler manager info
@@ -26,45 +26,56 @@ class TwitterCrawler
26
26
  end
27
27
  end
28
28
 
29
- def crawl
30
- @requests.get_page("https://twitter.com/search?f=tweets&q="+gen_query)
31
- scroll_down(0)
32
- get_tweets
33
- @requests.close_all_browsers
29
+ # Parse the tweets into html
30
+ def parse_tweets(tweets)
31
+ return tweets.map do |tweet|
32
+ parser = TwitterParser.new(tweet.to_html)
33
+ parser.parse_tweet
34
+ end
34
35
  end
35
36
 
36
- # Get the tweets on the page
37
- def get_tweets
38
- browser = @requests.get_most_recent_browser[1].first
39
- tweets = browser.find_elements(class: "tweet")
40
-
41
- # Parse each tweet
42
- tweets.each do |tweet|
43
- # Parse tweet
44
- tweet_html = tweet.attribute("innerHTML")
45
- parser = TwitterParser.new(tweet_html)
46
- parsed_tweet = parser.parse_tweet
47
-
48
- # Report results
49
- if parsed_tweet
50
- report_results([parsed_tweet], parsed_tweet[:tweet_link])
51
- end
37
+ # Generate the query url for Twitter
38
+ def gen_query_url(start_tweet, end_tweet)
39
+ # Base query url
40
+ query_url = "https://twitter.com/i/search/timeline?f=tweets&vertical=news&q="+gen_query+"&src=typd&include_available_features=1&include_entities=1"
41
+
42
+ # Gen query URL
43
+ if start_tweet && end_tweet
44
+ query_url += "&max_position=TWEET-"+start_tweet+"-"+end_tweet
52
45
  end
46
+ return query_url
53
47
  end
54
48
 
55
- # Scroll down to the bottom
56
- def scroll_down(last_tweet_num)
57
- # Scroll down to last tweet
58
- browser = @requests.get_most_recent_browser[1].first
59
- tweets = browser.find_elements(class: "tweet")
60
- tweets[tweets.length-2].location_once_scrolled_into_view
61
-
62
- # Check if it should be rerun
63
- sleep(1)
64
- tweet_count = browser.find_elements(class: "tweet").length
65
- if tweet_count > last_tweet_num
66
- scroll_down(tweet_count)
67
- end
49
+ # Query tweets
50
+ def query_tweets(start_tweet, end_tweet)
51
+ # Run Query and parse results
52
+ c = Curl::Easy.perform(gen_query_url(start_tweet, end_tweet))
53
+ curl_items = JSON.parse(c.body_str)
54
+ tweets = Nokogiri::HTML.parse(curl_items["items_html"]).css(".tweet") if curl_items["items_html"]
55
+
56
+ # Save results
57
+ parsed_tweets = parse_tweets(tweets)
58
+ report_results(parsed_tweets, "Saving "+parsed_tweets.length.to_s+" tweets")
59
+
60
+ # Recurse when needed
61
+ if !parsed_tweets.empty?
62
+ start_tweet, end_tweet = get_tweet_range(parsed_tweets, end_tweet)
63
+ query_tweets(start_tweet, end_tweet)
64
+ end
65
+ end
66
+
67
+ # Get the ID for a tweet
68
+ def get_tweet_id(tweet)
69
+ return tweet[:tweet_link].split("/").last
70
+ end
71
+
72
+ # Get start and end tweets
73
+ def get_tweet_range(parsed_tweets, end_tweet)
74
+ if end_tweet # Keep latest tweet as same
75
+ return get_tweet_id(parsed_tweets.last), end_tweet
76
+ else # Get updated start tweet
77
+ return get_tweet_id(parsed_tweets.last), get_tweet_id(parsed_tweets.first)
78
+ end
68
79
  end
69
80
 
70
81
  # Figure out how to report results
@@ -98,3 +109,4 @@ class TwitterCrawler
98
109
  end
99
110
  end
100
111
 
112
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twittercrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-03 00:00:00.000000000 Z
11
+ date: 2017-02-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls Twitter
14
14
  email: shidash@shidash.com