extraloop 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -1
- data/README.md +2 -2
- data/examples/google_news_scraper.rb +1 -1
- data/examples/wikipedia_categories.rb +1 -1
- data/lib/extraloop.rb +18 -16
- data/lib/extraloop/dom_extractor.rb +38 -36
- data/lib/extraloop/extraction_environment.rb +16 -14
- data/lib/extraloop/extraction_loop.rb +37 -37
- data/lib/extraloop/extractor_base.rb +34 -33
- data/lib/extraloop/hookable.rb +18 -18
- data/lib/extraloop/iterative_scraper.rb +249 -250
- data/lib/extraloop/json_extractor.rb +27 -26
- data/lib/extraloop/loggable.rb +50 -49
- data/lib/extraloop/scraper_base.rb +144 -141
- data/lib/extraloop/utils.rb +64 -61
- data/spec/helpers/spec_helper.rb +2 -1
- metadata +24 -13
@@ -1,36 +1,37 @@
|
|
1
|
-
|
2
|
-
class JsonExtractor < ExtractorBase
|
1
|
+
module ExtraLoop
|
2
|
+
class JsonExtractor < ExtractorBase
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
# Builds the extractor, remembering an optional key path.
#
# args - Constructor arguments, forwarded unchanged to the superclass.
#        When the third argument is an Array it is kept as @path (later
#        passed to get_in by #extract_field and #extract_list);
#        otherwise @path is nil.
def initialize(*args)
  path_candidate = args[2]
  @path = path_candidate.is_a?(Array) ? path_candidate : nil
  super(*args)
end
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
9
|
+
# Extracts a single value from a node.
#
# node   - The node to read from; a String is run through #parse first.
# record - Passed along to the callback together with the value (optional).
#
# Returns the extracted value.
def extract_field(node, record=nil)
  node = parse(node) if node.is_a?(String)

  value = node
  value = node.get_in(@path) if @path
  value = node[@attribute.to_s] if @attribute
  value = @environment.run(value, record, &@callback) if @callback

  # with neither an attribute nor a callback, try fetching by field name;
  # the value computed so far is kept when the node has no such entry
  unless @attribute || @callback
    by_field_name = node[@field_name.to_s]
    value = by_field_name if by_field_name
  end

  value
end
|
19
|
-
output
|
20
|
-
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
22
|
+
# Resolves the collection the loop will iterate over.
#
# input - The document; a String is run through #parse first.
#
# Returns the callback result coerced with Array() when a callback is set,
# otherwise the (optionally path-resolved) input itself.
def extract_list(input)
  #TODO: implement more clever stuff here after looking
  # into possible hash traversal techniques

  document = input.is_a?(String) ? parse(input) : input
  document = document.get_in(@path) if @path
  return document unless @callback

  Array(@environment.run(document, &@callback))
end
|
31
31
|
|
32
|
-
|
33
|
-
|
34
|
-
|
32
|
+
# Parses a JSON string and publishes the result on the environment.
#
# input - The JSON String to parse.
#
# Returns the parsed document, extended with Utils::DeepFetchable.
def parse(input)
  super(input)
  document = (Yajl::Parser.new).parse(input)
  document.extend(Utils::DeepFetchable)
  @environment.document = document
end
|
35
36
|
end
|
36
37
|
end
|
data/lib/extraloop/loggable.rb
CHANGED
@@ -1,64 +1,65 @@
|
|
1
1
|
autoload :Logging, "logging"
|
2
2
|
|
3
|
-
|
4
|
-
#
|
3
|
+
module ExtraLoop
|
4
|
+
# Decorates a class with an instance of Logging.logger and a convenient
|
5
|
+
# helper method to log messages.
|
5
6
|
|
6
|
-
module Loggable
|
7
|
-
|
7
|
+
module Loggable
|
8
|
+
protected
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
#
|
11
|
+
# Initializes the incorporated logger object.
|
12
|
+
#
|
13
|
+
# Returns nothing.
|
14
|
+
#
|
14
15
|
|
15
|
-
|
16
|
-
|
16
|
+
# Sets up @log from @options[:log].
#
# @options[:log] may be:
#   nil / false - logging stays disabled and no logger is created;
#   a Hash      - :appenders (list of appenders) and :log_level;
#   anything else truthy (e.g. true) - standard error at :info level.
#
# A Hash without any appenders leaves logging disabled.
#
# Returns nothing.
def init_log!
  log_options = @options[:log]
  return unless log_options

  # `:log => true` used to raise NoMethodError: the default Hash below was
  # guarded by `||=`, which can never fire after the early return above.
  unless log_options.is_a?(Hash)
    log_options = @options[:log] = {
      :appenders => [ Logging.appenders.stderr ],
      :log_level => :info
    }
  end

  appenders = log_options[:appenders]
  return unless appenders && appenders.any?

  @log = Logging.logger["#{self}"]
  @log.add_appenders(appenders)
  @log.level = log_options[:log_level] || :info
  nil
end
|
28
|
-
end
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
#
|
32
|
+
# Convenience method for logging messages.
|
33
|
+
#
|
34
|
+
# message - the message content
|
35
|
+
# log_level - the message's log level, sent to the logger as a method name (e.g. :debug, :info, :warn, :error; defaults to :info)
|
36
|
+
#
|
37
|
+
# Returns nothing.
|
38
|
+
#
|
38
39
|
|
39
|
-
|
40
|
-
|
40
|
+
def log(message, log_level = :info)
  # without a configured logger the message is silently dropped
  return unless @log

  # the level doubles as the logger method name
  @log.send(log_level, message)
end
|
41
43
|
end
|
42
|
-
end
|
43
|
-
|
44
|
-
|
45
|
-
#
|
46
|
-
# Monkey patches ScraperBase.
|
47
|
-
#
|
48
|
-
class ScraperBase
|
49
|
-
include Loggable
|
50
|
-
alias_method :base_initialize, :initialize
|
51
44
|
|
45
|
+
#
|
46
|
+
# Monkey patches ScraperBase.
|
52
47
|
#
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
48
|
+
class ScraperBase
|
49
|
+
include Loggable
|
50
|
+
alias_method :base_initialize, :initialize
|
51
|
+
|
52
|
+
#
|
53
|
+
# Wraps ScraperBase#initialize so that init_log! runs after the original constructor.
|
54
|
+
#
|
55
|
+
# args - The arguments to be passed over to the ScraperBase#initialize method.
|
56
|
+
#
|
57
|
+
# Returns itself.
|
58
|
+
#
|
59
|
+
def initialize(*args)
  # run the original constructor (aliased as base_initialize) first, so
  # that @options is in place before the logger reads it
  base_initialize(*args)
  init_log!
  self
end
|
63
64
|
end
|
64
65
|
end
|
@@ -1,166 +1,169 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
1
|
+
module ExtraLoop
|
2
|
+
class ScraperBase
|
3
|
+
include Hookable
|
4
|
+
include Utils::Support
|
5
|
+
|
6
|
+
attr_reader :results, :options
|
7
|
+
|
8
|
+
#
|
9
|
+
# Public: Initializes a web scraper.
|
10
|
+
#
|
11
|
+
# urls - One or several urls.
|
12
|
+
# options - Hash of scraper options
|
13
|
+
# async : Whether the scraper should issue HTTP requests in series or in parallel.
|
14
|
+
# log : logging options (defaults to standard error; set to false to suppress logging completely).
|
15
|
+
# appenders : specifies where the log messages should be appended to (defaults to standard error).
|
16
|
+
# log_level : specifies the log level (defaults to info).
|
17
|
+
# arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
|
18
|
+
#
|
19
|
+
#
|
20
|
+
#
|
21
|
+
# Returns itself.
|
22
|
+
#
|
23
|
+
|
24
|
+
def initialize(urls, options = {}, arguments = {})
  @urls = Array(urls)
  @loop_extractor_args = nil
  @extractor_args = []
  @loop = nil

  @request_arguments = arguments

  @options = {
    :async => false
  }.merge(options)

  @response_count = 0
  @queued_count = 0

  @hooks = {}
  @failed_requests = []

  # Hydra takes an options Hash. The previous expression handed it the
  # bare :max_concurrency value whenever the caller supplied one, and only
  # built a Hash for the default of 10.
  hydra_settings = @options[:hydra]
  max_concurrency = hydra_settings && hydra_settings[:max_concurrency] || 10
  @hydra = Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
  self
end
|
46
47
|
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
49
|
+
# Public: Sets the scraper extraction loop.
|
50
|
+
#
|
51
|
+
# Delegates to Extractor; raises an exception unless a selector, a block, or an attribute name is provided.
|
52
|
+
#
|
53
|
+
#
|
54
|
+
# selector - The CSS3 selector identifying the node list over which to iterate (optional).
|
55
|
+
# callback - A block of code (optional).
|
56
|
+
# attribute - An attribute name (optional).
|
57
|
+
#
|
58
|
+
# Returns itself.
|
59
|
+
#
|
60
|
+
|
61
|
+
def loop_on(*args)
  environment = ExtractionEnvironment.new(self)
  # the first slot stays nil (presumably the field name, which the loop
  # extractor does not have); the environment always comes second
  @loop_extractor_args = [nil, environment, *args]
  self
end
|
64
65
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
66
|
+
# Public: Registers a new extractor to be added to the loop.
|
67
|
+
#
|
68
|
+
# Delegates to Extractor; raises an exception unless a selector, a block, or an attribute name is provided.
|
69
|
+
#
|
70
|
+
# selector - The CSS3 selector identifying the node list over which to iterate (optional).
|
71
|
+
# callback - A block of code (optional).
|
72
|
+
# attribute - An attribute name (optional).
|
73
|
+
#
|
74
|
+
# Returns itself.
|
75
|
+
#
|
76
|
+
#
|
77
|
+
|
78
|
+
def extract(*args)
  # the environment is slotted in right after the first argument; with no
  # arguments at all the first slot ends up nil
  leading, *remaining = args
  @extractor_args << [leading, ExtractionEnvironment.new(self), *remaining]
  self
end
|
81
82
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
83
|
+
#
|
84
|
+
# Public: Runs the main scraping loop.
|
85
|
+
#
|
86
|
+
# Returns itself.
|
87
|
+
#
|
88
|
+
def run
  @urls.each do |url|
    issue_request(url)
    # synchronous mode: process each request before queueing the next one
    @hydra.run unless @options[:async]
  end

  # asynchronous mode: start processing the Hydra HTTP queue only once every
  # url has been appended to it (see #issue_request). The check used to be
  # `url == @urls.last`, which also fired for any earlier url equal to the
  # last one and so started the queue too early.
  @hydra.run if @options[:async] && !@urls.empty?
  self
end
|
102
|
-
self
|
103
|
-
end
|
104
105
|
|
105
|
-
|
106
|
+
protected
|
106
107
|
|
107
|
-
|
108
|
+
# Queues a single HTTP request on the Hydra queue.
#
# url - The url to fetch; its query string is folded into the request
#       params and stripped from the requested url.
def issue_request(url)
  # NOTE(review): the merged params are written back onto @request_arguments,
  # so they appear to carry over to later urls. Confirm this is intended.
  @request_arguments[:params] = merge_request_parameters(url)
  bare_url = url.gsub(/\?.*/,"")

  default_headers = [
    'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
  ].join("\n")

  # caller-supplied request arguments override the defaults
  request_options = { 'headers' => default_headers }
  request_options.merge!(@request_arguments)

  request = Typhoeus::Request.new(bare_url, request_options)
  request.on_complete { |response| handle_response(response) }

  log("queueing url: #{url}, params #{request_options[:params]}", :debug)
  @queued_count += 1
  @hydra.queue(request)
end
|
130
131
|
|
131
|
-
|
132
|
-
|
133
|
-
|
132
|
+
# Combines the url's query-string parameters with the configured ones.
#
# url - The url whose query string should be read.
#
# Returns the configured params (or an empty Hash) when the url yields no
# mergeable query hash, otherwise the url params merged with the configured
# params, whose keys are passed through symbolize_keys first.
def merge_request_parameters(url)
  uri = URI::parse(url).extend(Utils::URIAddition)
  url_params = uri.query_hash
  unless url_params && url_params.respond_to?(:merge)
    return @request_arguments[:params] || {}
  end

  # the configured params are the merge argument, so (assuming Hash#merge
  # semantics) they win over url params with the same key
  configured = symbolize_keys(@request_arguments[:params] ||= {})
  url_params.merge(configured)
end
|
138
139
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
140
|
+
# Processes one completed HTTP response: builds the extraction loop, runs
# it, and hands the extracted records to the :data hook.
#
# response - The completed response object.
def handle_response(response)
  @response_count += 1
  extraction_loop = prepare_loop(response)
  @loop = extraction_loop
  log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
  extraction_loop.run

  # keep the loop's environment around once it has run
  @environment = extraction_loop.environment
  run_hook(:data, [extraction_loop.records, response])
end
|
148
149
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
150
|
+
# Builds the ExtractionLoop for a response.
#
# response - The completed response object.
#
# The format comes from @options[:format] when set, otherwise it is detected
# from the Content-Type header; :json selects JsonExtractor, anything else
# DomExtractor.
def prepare_loop(response)
  format = @options[:format]
  format ||= detect_format(response.headers_hash.fetch('Content-Type', nil))
  extractor_class = (format == :json) ? JsonExtractor : DomExtractor

  loop_extractor = extractor_class.new(*@loop_extractor_args)
  field_extractors = @extractor_args.map do |extractor_args|
    extractor_class.new(*extractor_args)
  end

  ExtractionLoop.new(loop_extractor, field_extractors, response.body, @hooks, self)
end
|
156
157
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
158
|
+
# Maps a Content-Type header value to an extraction format.
#
# content_type - The raw header value, e.g. "application/json; charset=utf-8"
#                (may be nil).
#
# Returns :json when the media type ends in "json", :html otherwise.
def detect_format(content_type)
  #TODO: add support for xml/rdf documents

  # Parameters such as "; charset=utf-8" follow the media type; the former
  # /json$/ match was defeated by them and fell back to :html.
  media_type = content_type.to_s.split(';').first.to_s.strip.downcase
  media_type.end_with?('json') ? :json : :html
end
|
166
|
+
|
164
167
|
end
|
165
168
|
|
166
169
|
end
|