extraloop 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +1,37 @@
-require 'pry'
-class JsonExtractor < ExtractorBase
+module ExtraLoop
+  class JsonExtractor < ExtractorBase
 
-  def initialize(*args)
-    @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
-    super(*args)
-  end
+    def initialize(*args)
+      @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
+      super(*args)
+    end
 
-  def extract_field(node, record=nil)
-    output = node = node.is_a?(String) ? parse(node) : node
-    output = node.get_in(@path) if @path
-    output = node[@attribute.to_s] if @attribute
-    output = @environment.run(output, record, &@callback) if @callback
+    def extract_field(node, record=nil)
+      output = node = node.is_a?(String) ? parse(node) : node
+      output = node.get_in(@path) if @path
+      output = node[@attribute.to_s] if @attribute
+      output = @environment.run(output, record, &@callback) if @callback
 
-    # when no attribute and no callback is provided, try fetching by field name
-    if !@attribute && !@callback
-      output = node[@field_name.to_s] if node[@field_name.to_s]
+      # when no attribute and no callback is provided, try fetching by field name
+      if !@attribute && !@callback
+        output = node[@field_name.to_s] if node[@field_name.to_s]
+      end
+      output
     end
-    output
-  end
 
-  def extract_list(input)
-    #TODO: implement more clever stuff here after looking
-    # into possible hash traversal techniques
+    def extract_list(input)
+      #TODO: implement more clever stuff here after looking
+      # into possible hash traversal techniques
 
-    input = input.is_a?(String) ? parse(input) : input
-    input = input.get_in(@path) if @path
+      input = input.is_a?(String) ? parse(input) : input
+      input = input.get_in(@path) if @path
 
-    @callback && Array(@environment.run(input, &@callback)) || input
-  end
+      @callback && Array(@environment.run(input, &@callback)) || input
+    end
 
-  def parse(input)
-    super(input)
-    @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
+    def parse(input)
+      super(input)
+      @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
+    end
   end
 end
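
In both versions, JsonExtractor#extract_field can walk into the parsed JSON document through the optional @path array (via Utils::DeepFetchable#get_in, which is not part of this diff). The snippet below is a rough, hypothetical illustration of that kind of key-path lookup: the dig_path helper and the sample document are invented for the example, and it uses the json standard library rather than Yajl.

require 'json'

# Hypothetical stand-in for Utils::DeepFetchable#get_in: walk a parsed
# JSON document along an array of keys/indices, returning nil when a
# step is missing instead of raising.
def dig_path(document, path)
  path.reduce(document) do |node, key|
    break nil unless node.respond_to?(:[])
    node[key]
  end
end

doc = JSON.parse('{"results": {"items": [{"title": "first"}]}}')
dig_path(doc, ["results", "items", 0, "title"]) # => "first"
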
@@ -1,64 +1,65 @@
 autoload :Logging, "logging"
 
-# Decorates a class with an instance of Logging.logger and a convenient
-# helper method to log messages.
+module ExtraLoop
+  # Decorates a class with an instance of Logging.logger and a convenient
+  # helper method to log messages.
 
-module Loggable
-  protected
+  module Loggable
+    protected
 
-  #
-  # Initializes the incorporated logger object.
-  #
-  # Returns nothing.
-  #
+    #
+    # Initializes the incorporated logger object.
+    #
+    # Returns nothing.
+    #
 
-  def init_log!
-    return unless @options[:log]
+    def init_log!
+      return unless @options[:log]
 
-    @options[:log] ||= {
-      :appenders => [ Logging.appenders.stderr ],
-      :log_level => :info
-    }
+      @options[:log] ||= {
+        :appenders => [ Logging.appenders.stderr ],
+        :log_level => :info
+      }
 
-    if @options[:log] && @options[:log][:appenders] && @options[:log][:appenders].any?
-      @log = Logging.logger["#{self}"]
-      @log.add_appenders(@options[:log][:appenders])
-      @log.level = @options[:log] && @options[:log][:log_level] || :info
+      if @options[:log] && @options[:log][:appenders] && @options[:log][:appenders].any?
+        @log = Logging.logger["#{self}"]
+        @log.add_appenders(@options[:log][:appenders])
+        @log.level = @options[:log] && @options[:log][:log_level] || :info
+      end
     end
-  end
 
-  #
-  # Convenience method for logging messages.
-  #
-  # messages - the message content
-  # log_level - the message's log level (can be either :info, :debug, :error, :warning; defaults to :info)
-  #
-  # Returns nothing.
-  #
+    #
+    # Convenience method for logging messages.
+    #
+    # messages - the message content
+    # log_level - the message's log level (can be either :info, :debug, :error, :warning; defaults to :info)
+    #
+    # Returns nothing.
+    #
 
-  def log(message, log_level = :info)
-    @log.send(log_level, message) if @log
+    def log(message, log_level = :info)
+      @log.send(log_level, message) if @log
+    end
   end
-end
-
-
-#
-# Monkey patches ScraperBase.
-#
-class ScraperBase
-  include Loggable
-  alias_method :base_initialize, :initialize
 
+  #
+  # Monkey patches ScraperBase.
   #
-  # Wrapp ScraperBase#initialize method into Loggable#initialize
-  #
-  # args - The arguments to be passed over to the ScraperBase#initialize method.
-  #
-  # Returns itself.
-  #
-  def initialize(*args)
-    base_initialize(*args)
-    init_log!
-    self
+  class ScraperBase
+    include Loggable
+    alias_method :base_initialize, :initialize
+
+    #
+    # Wrapp ScraperBase#initialize method into Loggable#initialize
+    #
+    # args - The arguments to be passed over to the ScraperBase#initialize method.
+    #
+    # Returns itself.
+    #
+    def initialize(*args)
+      base_initialize(*args)
+      init_log!
+      self
+    end
   end
 end
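
In both versions Loggable#init_log! reads its configuration from @options[:log], expecting :appenders and a :log_level for the logging gem. A minimal sketch of such an options hash handed to a scraper, assuming the gem is loaded with require 'extraloop'; the URL is a placeholder.

require 'extraloop'
require 'logging'

# Hypothetical logging configuration, mirroring the keys read by
# Loggable#init_log! above: appenders and a log level for the logging gem.
options = {
  :log => {
    :appenders => [Logging.appenders.stderr],
    :log_level => :debug
  }
}

# As of 0.0.3 the class is namespaced under ExtraLoop.
scraper = ExtraLoop::ScraperBase.new("http://example.com/data.json", options)
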
@@ -1,166 +1,169 @@
-class ScraperBase
-  include Hookable
-  include Utils::Support
-
-  attr_reader :results, :options
-
-  #
-  # Public: Initalizes a web scraper.
-  #
-  # urls - One or several urls.
-  # options - Hash of scraper options
-  # async : Whether the scraper should issue HTTP requests in series or in parallel (set to false to suppress logging completely).
-  # log : logging options (defaults to standard error).
-  # appenders : specifies where the log messages should be appended to (defaults to standard error).
-  # log_level : specifies the log level (defaults to info).
-  # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
-  #
-  #
-  #
-  # Returns itself.
-  #
-
-  def initialize(urls, options = {}, arguments = {})
-    @urls = Array(urls)
-    @loop_extractor_args = nil
-    @extractor_args = []
-    @loop = nil
-
-    @request_arguments = arguments
-
-    @options = {
-      :async => false
-    }.merge(options)
-
-
-    @response_count = 0
-    @queued_count = 0
-
-    @hooks = {}
-    @failed_requests = []
-
-    hydra_options = @options[:hydra] && @options[:hydra][:max_concurrency] || {:max_concurrency => 10}
-    @hydra = Typhoeus::Hydra.new hydra_options
-    self
-  end
+module ExtraLoop
+  class ScraperBase
+    include Hookable
+    include Utils::Support
+
+    attr_reader :results, :options
+
+    #
+    # Public: Initalizes a web scraper.
+    #
+    # urls - One or several urls.
+    # options - Hash of scraper options
+    # async : Whether the scraper should issue HTTP requests in series or in parallel (set to false to suppress logging completely).
+    # log : logging options (defaults to standard error).
+    # appenders : specifies where the log messages should be appended to (defaults to standard error).
+    # log_level : specifies the log level (defaults to info).
+    # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+    #
+    #
+    #
+    # Returns itself.
+    #
+
+    def initialize(urls, options = {}, arguments = {})
+      @urls = Array(urls)
+      @loop_extractor_args = nil
+      @extractor_args = []
+      @loop = nil
+
+      @request_arguments = arguments
+
+      @options = {
+        :async => false
+      }.merge(options)
+
+
+      @response_count = 0
+      @queued_count = 0
+
+      @hooks = {}
+      @failed_requests = []
+
+      hydra_options = @options[:hydra] && @options[:hydra][:max_concurrency] || {:max_concurrency => 10}
+      @hydra = Typhoeus::Hydra.new hydra_options
+      self
+    end
 
 
-  # Public: Sets the scraper extraction loop.
-  #
-  # Delegates to Extractor, will raise an exception if neither a selector, a block, or an attribute name is provided.
-  #
-  #
-  # selector - The CSS3 selector identifying the node list over which iterate (optional).
-  # callback - A block of code (optional).
-  # attribute - An attribute name (optional).
-  #
-  # Returns itself.
-  #
-
-  def loop_on(*args)
-    @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
-    self
-  end
+    # Public: Sets the scraper extraction loop.
+    #
+    # Delegates to Extractor, will raise an exception if neither a selector, a block, or an attribute name is provided.
+    #
+    #
+    # selector - The CSS3 selector identifying the node list over which iterate (optional).
+    # callback - A block of code (optional).
+    # attribute - An attribute name (optional).
+    #
+    # Returns itself.
+    #
+
+    def loop_on(*args)
+      @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
+      self
+    end
 
-  # Public: Registers a new extractor to be added to the loop.
-  #
-  # Delegates to Extractor, will raise an exception if neither a selector, a block, or an attribute name is provided.
-  #
-  # selector - The CSS3 selector identifying the node list over which iterate (optional).
-  # callback - A block of code (optional).
-  # attribute - An attribute name (optional).
-  #
-  # Returns itself.
-  #
-  #
-
-  def extract(*args)
-    @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
-    self
-  end
+    # Public: Registers a new extractor to be added to the loop.
+    #
+    # Delegates to Extractor, will raise an exception if neither a selector, a block, or an attribute name is provided.
+    #
+    # selector - The CSS3 selector identifying the node list over which iterate (optional).
+    # callback - A block of code (optional).
+    # attribute - An attribute name (optional).
+    #
+    # Returns itself.
+    #
+    #
+
+    def extract(*args)
+      @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
+      self
+    end
 
-  #
-  # Public: Runs the main scraping loop.
-  #
-  # Returns nothing
-  #
-  def run
-    @urls.each do |url|
-      issue_request(url)
-
-      # if the scraper is asynchronous start processing the Hydra HTTP queue
-      # only after that the last url has been appended to the queue (see #issue_request).
-      #
-      if @options[:async]
-        if url == @urls.last
+    #
+    # Public: Runs the main scraping loop.
+    #
+    # Returns nothing
+    #
+    def run
+      @urls.each do |url|
+        issue_request(url)
+
+        # if the scraper is asynchronous start processing the Hydra HTTP queue
+        # only after that the last url has been appended to the queue (see #issue_request).
+        #
+        if @options[:async]
+          if url == @urls.last
+            @hydra.run
+          end
+        else
           @hydra.run
         end
-      else
-        @hydra.run
       end
+      self
     end
-    self
-  end
 
-  protected
+    protected
 
-  def issue_request(url)
+    def issue_request(url)
 
-    @request_arguments[:params] = merge_request_parameters(url)
-    url_without_params = url.gsub(/\?.*/,"")
+      @request_arguments[:params] = merge_request_parameters(url)
+      url_without_params = url.gsub(/\?.*/,"")
 
-    arguments = {
-      'headers' => [
-        'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
-        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
-      ].join("\n")
-    }
+      arguments = {
+        'headers' => [
+          'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
+          'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+        ].join("\n")
+      }
 
-    arguments.merge!(@request_arguments)
-    request = Typhoeus::Request.new(*[url_without_params, arguments])
+      arguments.merge!(@request_arguments)
+      request = Typhoeus::Request.new(*[url_without_params, arguments])
 
-    request.on_complete do |response|
-      handle_response(response)
-    end
+      request.on_complete do |response|
+        handle_response(response)
+      end
 
-    log("queueing url: #{url}, params #{arguments[:params]}", :debug)
-    @queued_count += 1
-    @hydra.queue(request)
-  end
+      log("queueing url: #{url}, params #{arguments[:params]}", :debug)
+      @queued_count += 1
+      @hydra.queue(request)
+    end
 
-  def merge_request_parameters(url)
-    url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
-    return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)
+    def merge_request_parameters(url)
+      url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
+      return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)
 
-    params = symbolize_keys(@request_arguments[:params] ||= {})
-    url_params.merge(params)
-  end
+      params = symbolize_keys(@request_arguments[:params] ||= {})
+      url_params.merge(params)
+    end
 
-  def handle_response(response)
-    @response_count += 1
-    @loop = prepare_loop(response)
-    log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
-    @loop.run
+    def handle_response(response)
+      @response_count += 1
+      @loop = prepare_loop(response)
+      log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
+      @loop.run
 
-    @environment = @loop.environment
-    run_hook(:data, [@loop.records, response])
-  end
+      @environment = @loop.environment
+      run_hook(:data, [@loop.records, response])
+    end
 
-  def prepare_loop(response)
-    format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
-    extractor_class = format == :json ? JsonExtractor : DomExtractor
-    loop_extractor = extractor_class.new(*@loop_extractor_args)
-    extractors = @extractor_args.map { |args| extractor_class.new(*args) }
-    ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
-  end
+    def prepare_loop(response)
+      format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
+      extractor_class = format == :json ? JsonExtractor : DomExtractor
+      loop_extractor = extractor_class.new(*@loop_extractor_args)
+      extractors = @extractor_args.map { |args| extractor_class.new(*args) }
+      ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
+    end
 
-  def detect_format(content_type)
-    #TODO: add support for xml/rdf documents
-    if content_type && content_type =~ /json$/
-      :json
-    else
-      :html
+    def detect_format(content_type)
+      #TODO: add support for xml/rdf documents
+      if content_type && content_type =~ /json$/
+        :json
+      else
+        :html
+      end
     end
+
   end
 
 end
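
Apart from the module wrapping, ScraperBase keeps the same flow in both versions: loop_on stores the loop extractor arguments, extract queues field extractors, and run issues the requests through Typhoeus::Hydra, with handle_response firing the :data hook on the extracted records. A minimal usage sketch against 0.0.3, assuming the Hookable module (not shown in this diff) exposes an #on method for registering that hook; the URL, selectors, and field names are placeholders.

require 'extraloop'

results = []

scraper = ExtraLoop::ScraperBase.new("http://example.com/listing")
scraper.loop_on("div.item")          # CSS3 selector for the node list to iterate over
scraper.extract(:title, "h2")        # field name plus a selector
scraper.extract(:url, "a", :href)    # field name, selector, and attribute

# handle_response calls run_hook(:data, [@loop.records, response]), so the
# handler receives the extracted records and the raw response. The #on
# registration method is assumed here to come from Hookable.
scraper.on(:data) { |records, _response| results.concat(records) }

scraper.run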