twittercrawler 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/twitter_parser.rb +2 -1
- data/lib/twittercrawler.rb +48 -36
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b2a8f686154fcb64246ed8c3bef02f324df3892
|
4
|
+
data.tar.gz: f8219382c56cfacad13e7fbd55a25e156280976f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f21e73e1bd854174182fee749153be4499c2b2f6742ae6ca09aa1e85e2d100e9eab9e2af3898400aa0f0bb6d9b071292cf4393d80b45cfaf37d0e22780a0834b
|
7
|
+
data.tar.gz: 7a8c376ffcaaeb0ae55964251dd7a0efaf5bece5fd81a03c99eed231784bc29d1f5c02855b31ee06931faa97e3da988146210afecfbb6a0304497fd0684f12f1
|
data/lib/twitter_parser.rb
CHANGED
data/lib/twittercrawler.rb
CHANGED
@@ -2,14 +2,14 @@ require 'requestmanager'
|
|
2
2
|
require 'selenium-webdriver'
|
3
3
|
require 'pry'
|
4
4
|
require 'nokogiri'
|
5
|
+
require 'curb'
|
5
6
|
|
6
7
|
load 'twitter_parser.rb'
|
7
8
|
|
8
9
|
class TwitterCrawler
|
9
|
-
def initialize(search_term, operator,
|
10
|
+
def initialize(search_term, operator, cm_hash)
|
10
11
|
@search_term = search_term
|
11
12
|
@operator = operator
|
12
|
-
@requests = requests
|
13
13
|
@output = Array.new
|
14
14
|
|
15
15
|
# Handle crawler manager info
|
@@ -26,45 +26,56 @@ class TwitterCrawler
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
# Parse the tweets into html
|
30
|
+
def parse_tweets(tweets)
|
31
|
+
return tweets.map do |tweet|
|
32
|
+
parser = TwitterParser.new(tweet.to_html)
|
33
|
+
parser.parse_tweet
|
34
|
+
end
|
34
35
|
end
|
35
36
|
|
36
|
-
#
|
37
|
-
def
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
#
|
42
|
-
|
43
|
-
|
44
|
-
tweet_html = tweet.attribute("innerHTML")
|
45
|
-
parser = TwitterParser.new(tweet_html)
|
46
|
-
parsed_tweet = parser.parse_tweet
|
47
|
-
|
48
|
-
# Report results
|
49
|
-
if parsed_tweet
|
50
|
-
report_results([parsed_tweet], parsed_tweet[:tweet_link])
|
51
|
-
end
|
37
|
+
# Generate the query url for Twitter
|
38
|
+
def gen_query_url(start_tweet, end_tweet)
|
39
|
+
# Base query url
|
40
|
+
query_url = "https://twitter.com/i/search/timeline?f=tweets&vertical=news&q="+gen_query+"&src=typd&include_available_features=1&include_entities=1"
|
41
|
+
|
42
|
+
# Gen query URL
|
43
|
+
if start_tweet && end_tweet
|
44
|
+
query_url += "&max_position=TWEET-"+start_tweet+"-"+end_tweet
|
52
45
|
end
|
46
|
+
return query_url
|
53
47
|
end
|
54
48
|
|
55
|
-
#
|
56
|
-
def
|
57
|
-
#
|
58
|
-
|
59
|
-
|
60
|
-
tweets[
|
61
|
-
|
62
|
-
#
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
49
|
+
# Query tweets
|
50
|
+
def query_tweets(start_tweet, end_tweet)
|
51
|
+
# Run Query and parse results
|
52
|
+
c = Curl::Easy.perform(gen_query_url(start_tweet, end_tweet))
|
53
|
+
curl_items = JSON.parse(c.body_str)
|
54
|
+
tweets = Nokogiri::HTML.parse(curl_items["items_html"]).css(".tweet") if curl_items["items_html"]
|
55
|
+
|
56
|
+
# Save results
|
57
|
+
parsed_tweets = parse_tweets(tweets)
|
58
|
+
report_results(parsed_tweets, "Saving "+parsed_tweets.length.to_s+" tweets")
|
59
|
+
|
60
|
+
# Recurse when needed
|
61
|
+
if !parsed_tweets.empty?
|
62
|
+
start_tweet, end_tweet = get_tweet_range(parsed_tweets, end_tweet)
|
63
|
+
query_tweets(start_tweet, end_tweet)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
# Get the ID for a tweet
|
68
|
+
def get_tweet_id(tweet)
|
69
|
+
return tweet[:tweet_link].split("/").last
|
70
|
+
end
|
71
|
+
|
72
|
+
# Get start and end tweets
|
73
|
+
def get_tweet_range(parsed_tweets, end_tweet)
|
74
|
+
if end_tweet # Keep latest tweet as same
|
75
|
+
return get_tweet_id(parsed_tweets.last), end_tweet
|
76
|
+
else # Get updated start tweet
|
77
|
+
return get_tweet_id(parsed_tweets.last), get_tweet_id(parsed_tweets.first)
|
78
|
+
end
|
68
79
|
end
|
69
80
|
|
70
81
|
# Figure out how to report results
|
@@ -98,3 +109,4 @@ class TwitterCrawler
|
|
98
109
|
end
|
99
110
|
end
|
100
111
|
|
112
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twittercrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Crawls Twitter
|
14
14
|
email: shidash@shidash.com
|