twittercrawler 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/lib/twittercrawler.rb +35 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 25d1d3c957757eeca51247c704e42f9c009a8d09
+  data.tar.gz: 55bcea247d9d2c926b6fdafe51c876de6ed30049
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a2bf6e5315fae1c511f63d94dd928c99998d7ae791983837fc9564e5e070ee4ca8afe73c2a5d959855e2e38706688db5e511abf208873bb1981bb3d2341835eb
+  data.tar.gz: 8495e3060c561ee3eb4885a6e8686a64256cd8a85a17562110030eff6c2da66765487f67e72370dd31ae07c350cd4633bdd798756c9abdac4f7addc69aa5908d
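
Note: the 0.0.3 package ships SHA1 and SHA512 digests for the metadata.gz and data.tar.gz archives inside the gem. As a rough, hypothetical illustration of how those values could be checked, the sketch below (not part of the gem) reads a locally downloaded copy and compares the SHA512 digests against the packaged checksums; the path twittercrawler-0.0.3.gem is an assumption.

require 'rubygems/package'
require 'zlib'
require 'digest'
require 'yaml'

# Hypothetical local path; adjust to wherever the .gem file was fetched.
gem_path = 'twittercrawler-0.0.3.gem'

expected = nil   # digests shipped in the gem (checksums.yaml, diffed above)
actual   = {}    # digests recomputed from the archives themselves

Gem::Package::TarReader.new(File.open(gem_path, 'rb')) do |tar|
  tar.each do |entry|
    case entry.full_name
    when 'checksums.yaml.gz'
      expected = YAML.safe_load(Zlib.gunzip(entry.read))
    when 'metadata.gz', 'data.tar.gz'
      actual[entry.full_name] = Digest::SHA512.hexdigest(entry.read)
    end
  end
end

actual.each do |name, digest|
  puts "#{name}: #{digest == expected['SHA512'][name] ? 'OK' : 'MISMATCH'}"
end
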
data/lib/twittercrawler.rb
CHANGED
@@ -6,11 +6,15 @@ require 'nokogiri'
 load 'twitter_parser.rb'
 
 class TwitterCrawler
-  def initialize(search_term, operator, requests)
+  def initialize(search_term, operator, requests, cm_hash)
     @search_term = search_term
     @operator = operator
     @requests = requests
     @output = Array.new
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Generate advanced query
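
Note: the constructor now takes a fourth argument, a hash of crawler-manager settings that is only read when non-nil. A minimal usage sketch follows; it is not part of the gem, and every argument value (search term, operator, request count, URL, selector id) is invented for illustration.

require 'twittercrawler'   # assumes the gem is installed

# Hypothetical crawler-manager settings.
cm_hash = {
  crawler_manager_url: "http://localhost:3000",  # assumed manager URL
  selector_id: "42"                              # assumed selector id
}

# Incremental mode: results are relayed to the crawler manager as they arrive.
crawler = TwitterCrawler.new("opensource", "OR", 2, cm_hash)

# Batch mode: passing nil leaves @cm_url unset, so results accumulate in @output
# (see the report_results dispatch added further down).
batch_crawler = TwitterCrawler.new("opensource", "OR", 2, nil)
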
@@ -36,9 +40,13 @@ class TwitterCrawler
 
     # Parse each tweet
     tweets.each do |tweet|
+      # Add tweet
       tweet_html = tweet.attribute("innerHTML")
       parser = TwitterParser.new(tweet_html)
-
+      parsed_tweet = parser.parse_tweet
+
+      # Report results
+      report_results([parsed_tweet], parsed_tweet[:tweet_link])
     end
   end
 
@@ -57,6 +65,31 @@ class TwitterCrawler
     end
   end
 
+  # Figure out how to report results
+  def report_results(results, link)
+    if @cm_url
+      report_incremental(results, link)
+    else
+      report_batch(results)
+    end
+  end
+
+  # Report all results in one JSON
+  def report_batch(results)
+    results.each do |result|
+      @output.push(result)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, link)
+    curl_url = @cm_url+"/relay_results"
+    c = Curl::Easy.http_post(curl_url,
+          Curl::PostField.content('selector_id', @selector_id),
+          Curl::PostField.content('status_message', "Collected " + link),
+          Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
   # Generate JSON for output
   def gen_json
     JSON.pretty_generate(@output)
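
Note: the incremental path uses the curb gem to POST each batch of parsed tweets to the manager's /relay_results endpoint. A standalone sketch of an equivalent request is shown below; the URL, selector id, and tweet link are placeholders rather than values from this release, and a real run would use the values stored by the constructor (@cm_url, @selector_id).

require 'curb'
require 'json'

# Placeholder values for illustration only.
cm_url      = "http://localhost:3000"
selector_id = "42"
results     = [{ tweet_link: "https://twitter.com/example/status/1" }]

# Same field names the gem sends: selector_id, status_message, results.
Curl::Easy.http_post(cm_url + "/relay_results",
  Curl::PostField.content('selector_id', selector_id),
  Curl::PostField.content('status_message', "Collected " + results.first[:tweet_link]),
  Curl::PostField.content('results', JSON.pretty_generate(results)))
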