twittercrawler 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/twittercrawler.rb +35 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 25d1d3c957757eeca51247c704e42f9c009a8d09
|
4
|
+
data.tar.gz: 55bcea247d9d2c926b6fdafe51c876de6ed30049
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a2bf6e5315fae1c511f63d94dd928c99998d7ae791983837fc9564e5e070ee4ca8afe73c2a5d959855e2e38706688db5e511abf208873bb1981bb3d2341835eb
|
7
|
+
data.tar.gz: 8495e3060c561ee3eb4885a6e8686a64256cd8a85a17562110030eff6c2da66765487f67e72370dd31ae07c350cd4633bdd798756c9abdac4f7addc69aa5908d
|
data/lib/twittercrawler.rb
CHANGED
@@ -6,11 +6,15 @@ require 'nokogiri'
|
|
6
6
|
load 'twitter_parser.rb'
|
7
7
|
|
8
8
|
class TwitterCrawler
|
9
|
-
def initialize(search_term, operator, requests)
|
9
|
+
def initialize(search_term, operator, requests, cm_hash)
|
10
10
|
@search_term = search_term
|
11
11
|
@operator = operator
|
12
12
|
@requests = requests
|
13
13
|
@output = Array.new
|
14
|
+
|
15
|
+
# Handle crawler manager info
|
16
|
+
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
17
|
+
@selector_id = cm_hash[:selector_id] if cm_hash
|
14
18
|
end
|
15
19
|
|
16
20
|
# Generate advanced query
|
@@ -36,9 +40,13 @@ class TwitterCrawler
|
|
36
40
|
|
37
41
|
# Parse each tweet
|
38
42
|
tweets.each do |tweet|
|
43
|
+
# Add tweet
|
39
44
|
tweet_html = tweet.attribute("innerHTML")
|
40
45
|
parser = TwitterParser.new(tweet_html)
|
41
|
-
|
46
|
+
parsed_tweet = parser.parse_tweet
|
47
|
+
|
48
|
+
# Report results
|
49
|
+
report_results([parsed_tweet], parsed_tweet[:tweet_link])
|
42
50
|
end
|
43
51
|
end
|
44
52
|
|
@@ -57,6 +65,31 @@ class TwitterCrawler
|
|
57
65
|
end
|
58
66
|
end
|
59
67
|
|
68
|
+
# Figure out how to report results
|
69
|
+
def report_results(results, link)
|
70
|
+
if @cm_url
|
71
|
+
report_incremental(results, link)
|
72
|
+
else
|
73
|
+
report_batch(results)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# Report all results in one JSON
|
78
|
+
def report_batch(results)
|
79
|
+
results.each do |result|
|
80
|
+
@output.push(result)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
# Report results back to Harvester incrementally
|
85
|
+
def report_incremental(results, link)
|
86
|
+
curl_url = @cm_url+"/relay_results"
|
87
|
+
c = Curl::Easy.http_post(curl_url,
|
88
|
+
Curl::PostField.content('selector_id', @selector_id),
|
89
|
+
Curl::PostField.content('status_message', "Collected " + link),
|
90
|
+
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
91
|
+
end
|
92
|
+
|
60
93
|
# Generate JSON for output
|
61
94
|
def gen_json
|
62
95
|
JSON.pretty_generate(@output)
|