twittercrawler 0.0.5 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1230a596110d9376fa6243651b2994f2d0228afb
-  data.tar.gz: f93449ae2abbc85227da02665f31ea6848712615
+  metadata.gz: 8b2a8f686154fcb64246ed8c3bef02f324df3892
+  data.tar.gz: f8219382c56cfacad13e7fbd55a25e156280976f
 SHA512:
-  metadata.gz: 2dd0c8b7e41565e4f3c71bfbbabfbe22b8f9cbec6a3bde803270ec62e11c591145e1e0bd4263a392627341f6ac96b8c51c89d39da9a90f9bf36465e41d69a66f
-  data.tar.gz: bbdb93efc8b9450b6da96f31c90d1446781ced81d5a2adf6e7540cb09b983d9bd5a1be12f844bca71dd36894e1836c8bfc05a810df4d088f6062ac19cc4ede49
+  metadata.gz: f21e73e1bd854174182fee749153be4499c2b2f6742ae6ca09aa1e85e2d100e9eab9e2af3898400aa0f0bb6d9b071292cf4393d80b45cfaf37d0e22780a0834b
+  data.tar.gz: 7a8c376ffcaaeb0ae55964251dd7a0efaf5bece5fd81a03c99eed231784bc29d1f5c02855b31ee06931faa97e3da988146210afecfbb6a0304497fd0684f12f1
twitter_parser.rb CHANGED
@@ -22,7 +22,8 @@ class TwitterParser
       favorite_count: get_favorite_count,
       reply_count: get_reply_count,
       mention_names: get_mentions[0],
-      mention_uids: get_mentions[1]
+      mention_uids: get_mentions[1],
+      time_collected: Time.now
     }
   end
 end
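
The only parser change: each parsed tweet now carries a time_collected timestamp. A hedged sketch of the hash parse_tweet returns after this change; the values are invented, and fields not visible in this hunk (such as tweet_link, which the crawler's get_tweet_id relies on) are inferred from their use elsewhere in the gem:

    # Illustrative output of TwitterParser#parse_tweet (values invented)
    {
      tweet_link: "/example_user/status/828000000000000000",
      favorite_count: 12,
      reply_count: 3,
      mention_names: ["@example_user"],
      mention_uids: ["12345678"],
      time_collected: Time.now  # new in 0.0.6: when the tweet was scraped
    }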
twitter_crawler.rb CHANGED
@@ -2,14 +2,14 @@ require 'requestmanager'
 require 'selenium-webdriver'
 require 'pry'
 require 'nokogiri'
+require 'curb'
 
 load 'twitter_parser.rb'
 
 class TwitterCrawler
-  def initialize(search_term, operator, requests, cm_hash)
+  def initialize(search_term, operator, cm_hash)
    @search_term = search_term
    @operator = operator
-   @requests = requests
    @output = Array.new
 
    # Handle crawler manager info
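
Note the breaking constructor change: 0.0.6 drops the requests argument (the RequestManager that drove Selenium browsers) now that pages are fetched directly with curb. A usage sketch with illustrative arguments; the diff does not show what cm_hash contains, so it is left opaque:

    # 0.0.5 needed a RequestManager instance:
    #   crawler = TwitterCrawler.new("example term", "AND", requests, cm_hash)

    # 0.0.6 fetches over HTTP via curb, so the browser manager is gone:
    crawler = TwitterCrawler.new("example term", "AND", cm_hash)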
@@ -26,45 +26,56 @@ class TwitterCrawler
     end
   end
 
-  def crawl
-    @requests.get_page("https://twitter.com/search?f=tweets&q="+gen_query)
-    scroll_down(0)
-    get_tweets
-    @requests.close_all_browsers
+  # Parse the tweets into html
+  def parse_tweets(tweets)
+    return tweets.map do |tweet|
+      parser = TwitterParser.new(tweet.to_html)
+      parser.parse_tweet
+    end
   end
 
-  # Get the tweets on the page
-  def get_tweets
-    browser = @requests.get_most_recent_browser[1].first
-    tweets = browser.find_elements(class: "tweet")
-
-    # Parse each tweet
-    tweets.each do |tweet|
-      # Parse tweet
-      tweet_html = tweet.attribute("innerHTML")
-      parser = TwitterParser.new(tweet_html)
-      parsed_tweet = parser.parse_tweet
-
-      # Report results
-      if parsed_tweet
-        report_results([parsed_tweet], parsed_tweet[:tweet_link])
-      end
+  # Generate the query url for Twitter
+  def gen_query_url(start_tweet, end_tweet)
+    # Base query url
+    query_url = "https://twitter.com/i/search/timeline?f=tweets&vertical=news&q="+gen_query+"&src=typd&include_available_features=1&include_entities=1"
+
+    # Gen query URL
+    if start_tweet && end_tweet
+      query_url += "&max_position=TWEET-"+start_tweet+"-"+end_tweet
     end
+    return query_url
   end
 
-  # Scroll down to the bottom
-  def scroll_down(last_tweet_num)
-    # Scroll down to last tweet
-    browser = @requests.get_most_recent_browser[1].first
-    tweets = browser.find_elements(class: "tweet")
-    tweets[tweets.length-2].location_once_scrolled_into_view
-
-    # Check if it should be rerun
-    sleep(1)
-    tweet_count = browser.find_elements(class: "tweet").length
-    if tweet_count > last_tweet_num
-      scroll_down(tweet_count)
-    end
+  # Query tweets
+  def query_tweets(start_tweet, end_tweet)
+    # Run Query and parse results
+    c = Curl::Easy.perform(gen_query_url(start_tweet, end_tweet))
+    curl_items = JSON.parse(c.body_str)
+    tweets = Nokogiri::HTML.parse(curl_items["items_html"]).css(".tweet") if curl_items["items_html"]
+
+    # Save results
+    parsed_tweets = parse_tweets(tweets)
+    report_results(parsed_tweets, "Saving "+parsed_tweets.length.to_s+" tweets")
+
+    # Recurse when needed
+    if !parsed_tweets.empty?
+      start_tweet, end_tweet = get_tweet_range(parsed_tweets, end_tweet)
+      query_tweets(start_tweet, end_tweet)
+    end
+  end
+
+  # Get the ID for a tweet
+  def get_tweet_id(tweet)
+    return tweet[:tweet_link].split("/").last
+  end
+
+  # Get start and end tweets
+  def get_tweet_range(parsed_tweets, end_tweet)
+    if end_tweet # Keep latest tweet as same
+      return get_tweet_id(parsed_tweets.last), end_tweet
+    else # Get updated start tweet
+      return get_tweet_id(parsed_tweets.last), get_tweet_id(parsed_tweets.first)
+    end
   end
 
   # Figure out how to report results
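
Together these methods replace browser scrolling with cursor-style pagination: query_tweets fetches a page of the search timeline, reports it, then recurses with a max_position=TWEET-<oldest>-<newest> window until a page yields no tweets. The first page pins the newest ID; each recursion moves the oldest ID further back. (One caveat visible in the code: if the response lacks items_html, tweets is nil and parse_tweets will raise rather than stop cleanly.) A standalone sketch of how the window evolves, with invented tweet IDs:

    # Mirrors get_tweet_id / get_tweet_range above; IDs are invented.
    def get_tweet_id(tweet)
      tweet[:tweet_link].split("/").last
    end

    def get_tweet_range(parsed_tweets, end_tweet)
      if end_tweet
        return get_tweet_id(parsed_tweets.last), end_tweet
      else
        return get_tweet_id(parsed_tweets.last), get_tweet_id(parsed_tweets.first)
      end
    end

    page1 = [{ tweet_link: "/a/status/905" },   # newest on the page
             { tweet_link: "/b/status/850" }]   # oldest on the page
    start_tweet, end_tweet = get_tweet_range(page1, nil)
    # => start "850", end "905": the first page fixes the newest edge

    page2 = [{ tweet_link: "/c/status/760" }]
    start_tweet, end_tweet = get_tweet_range(page2, end_tweet)
    # => start "760", end "905": end stays pinned, start walks back in time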
@@ -98,3 +109,4 @@ class TwitterCrawler
   end
 end
 
+
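
For reference, the fetch in query_tweets uses curb's Curl::Easy.perform, whose body_str holds the endpoint's raw JSON; the rendered tweets arrive as an HTML fragment under the items_html key. The diff adds require 'curb' but no require 'json', so JSON.parse presumably works because another dependency loads it. A minimal sketch of the parse step against an invented response body:

    require 'json'
    require 'nokogiri'

    # Invented stand-in for the timeline endpoint's JSON envelope
    body_str = '{"items_html": "<div class=\"tweet\">example</div>"}'

    curl_items = JSON.parse(body_str)
    tweets = Nokogiri::HTML.parse(curl_items["items_html"]).css(".tweet")
    puts tweets.length   # => 1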
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twittercrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-02-03 00:00:00.000000000 Z
+date: 2017-02-07 00:00:00.000000000 Z
 dependencies: []
 description: Crawls Twitter
 email: shidash@shidash.com