url_processor 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -51,19 +51,23 @@ module UrlProcessor
51
51
  # Output progress information
52
52
  config.logger.info "PROCESSED: #{processed_links}, NEXT GROUP SIZE: #{group.size}".yellow
53
53
 
54
+ yield group
55
+
54
56
  # for debugging purposes we do not want to process everything
55
57
  if config.debug && processed_links >= config.batch_size
56
58
  config.logger.debug "FINISHED first batch (#{@batch_size} records), exiting".yellow
57
59
  return
58
60
  end
59
61
 
60
- group.each do |element|
61
- yield element
62
- end
63
62
  end
64
63
  else
64
+ elements = []
65
65
  collection.each do |element|
66
- yield element
66
+ elements << element
67
+ if elements.size % batch_size == 0
68
+ yield elements
69
+ elements = elements.clear
70
+ end
67
71
  end
68
72
  end
69
73
  end
@@ -72,53 +76,56 @@ module UrlProcessor
72
76
  processed_links = 0
73
77
  hydra = Typhoeus::Hydra.new(max_concurrency: config.max_concurrency, max_total_connections: config.max_total_connections)
74
78
 
75
- find_in_batches(config.links.call, config.batch_size) do |link|
76
- # any custom pre-processing
77
- pre_process_link(link)
78
-
79
- if link.urls.empty?
80
- # In the event that we have a link that actually has no urls associated with it
81
- report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
82
- else
83
- # Each record has 2 urls associated with it, process each separately
84
- link.urls.each do |url|
85
- config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow
86
-
87
- link_request = config.new_link_request.call(
88
- url[:url],
89
- followlocation: true,
90
- method: :head,
91
- ssl_verifypeer: false,
92
- ssl_verifyhost: 2,
93
- cookiefile: config.cookies_file,
94
- cookiejar: config.cookies_file,
95
- link_id: link.id,
96
- url_type_code: url[:url_type_code],
97
- timeout: config.max_timeout,
98
- connecttimeout: config.max_timeout,
99
- max_retries: config.max_retries,
100
- forbid_reuse: 1,
101
- nosignal: 1
102
- )
103
-
104
- link_request.on_complete do |response|
105
- processed_links += 1
106
-
107
- if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
108
- config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
109
- hydra.queue response.request
110
- elsif response.return_code == :got_nothing && response.request.options[:method] != :get
111
- config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
112
-
113
- # set to GET request since HEAD may fail in some cases
114
- response.request.options[:method] = :get
115
- hydra.queue response.request
116
- else
117
- config.process_response.call response
79
+ find_in_batches(config.links.call, config.batch_size) do |group|
80
+
81
+ group.each do |link|
82
+ # any custom pre-processing
83
+ pre_process_link(link)
84
+
85
+ if link.urls.empty?
86
+ # In the event that we have a link that actually has no urls associated with it
87
+ report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
88
+ else
89
+ # Each record has 2 urls associated with it, process each separately
90
+ link.urls.each do |url|
91
+ config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow
92
+
93
+ link_request = config.new_link_request.call(
94
+ url[:url],
95
+ followlocation: true,
96
+ method: :head,
97
+ ssl_verifypeer: false,
98
+ ssl_verifyhost: 2,
99
+ cookiefile: config.cookies_file,
100
+ cookiejar: config.cookies_file,
101
+ link_id: link.id,
102
+ url_type_code: url[:url_type_code],
103
+ timeout: config.max_timeout,
104
+ connecttimeout: config.max_timeout,
105
+ max_retries: config.max_retries,
106
+ forbid_reuse: 1,
107
+ nosignal: 1
108
+ )
109
+
110
+ link_request.on_complete do |response|
111
+ processed_links += 1
112
+
113
+ if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
114
+ config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
115
+ hydra.queue response.request
116
+ elsif response.return_code == :got_nothing && response.request.options[:method] != :get
117
+ config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
118
+
119
+ # set to GET request since HEAD may fail in some cases
120
+ response.request.options[:method] = :get
121
+ hydra.queue response.request
122
+ else
123
+ config.process_response.call response
124
+ end
118
125
  end
119
- end
120
126
 
121
- hydra.queue link_request
127
+ hydra.queue link_request
128
+ end
122
129
  end
123
130
  end
124
131
 
@@ -1,3 +1,3 @@
1
1
  module UrlProcessor
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-04-29 00:00:00.000000000 Z
12
+ date: 2014-04-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: typhoeus
@@ -186,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
186
186
  version: '0'
187
187
  segments:
188
188
  - 0
189
- hash: -3648591694633088295
189
+ hash: 1450876520014070707
190
190
  required_rubygems_version: !ruby/object:Gem::Requirement
191
191
  none: false
192
192
  requirements:
@@ -195,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
195
195
  version: '0'
196
196
  segments:
197
197
  - 0
198
- hash: -3648591694633088295
198
+ hash: 1450876520014070707
199
199
  requirements: []
200
200
  rubyforge_project:
201
201
  rubygems_version: 1.8.23.2