url_processor 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,19 +51,23 @@ module UrlProcessor
51
51
  # Output progress information
52
52
  config.logger.info "PROCESSED: #{processed_links}, NEXT GROUP SIZE: #{group.size}".yellow
53
53
 
54
+ yield group
55
+
54
56
  # for debugging purposes we do not want to process everything
55
57
  if config.debug && processed_links >= config.batch_size
56
58
  config.logger.debug "FINISHED first batch (#{@batch_size} records), exiting".yellow
57
59
  return
58
60
  end
59
61
 
60
- group.each do |element|
61
- yield element
62
- end
63
62
  end
64
63
  else
64
+ elements = []
65
65
  collection.each do |element|
66
- yield element
66
+ elements << element
67
+ if elements.size % batch_size == 0
68
+ yield elements
69
+ elements = elements.clear
70
+ end
67
71
  end
68
72
  end
69
73
  end
@@ -72,53 +76,56 @@ module UrlProcessor
72
76
  processed_links = 0
73
77
  hydra = Typhoeus::Hydra.new(max_concurrency: config.max_concurrency, max_total_connections: config.max_total_connections)
74
78
 
75
- find_in_batches(config.links.call, config.batch_size) do |link|
76
- # any custom pre-processing
77
- pre_process_link(link)
78
-
79
- if link.urls.empty?
80
- # In the event that we have a link that actually has no urls associated with it
81
- report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
82
- else
83
- # Each record has 2 urls associated with it, process each separately
84
- link.urls.each do |url|
85
- config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow
86
-
87
- link_request = config.new_link_request.call(
88
- url[:url],
89
- followlocation: true,
90
- method: :head,
91
- ssl_verifypeer: false,
92
- ssl_verifyhost: 2,
93
- cookiefile: config.cookies_file,
94
- cookiejar: config.cookies_file,
95
- link_id: link.id,
96
- url_type_code: url[:url_type_code],
97
- timeout: config.max_timeout,
98
- connecttimeout: config.max_timeout,
99
- max_retries: config.max_retries,
100
- forbid_reuse: 1,
101
- nosignal: 1
102
- )
103
-
104
- link_request.on_complete do |response|
105
- processed_links += 1
106
-
107
- if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
108
- config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
109
- hydra.queue response.request
110
- elsif response.return_code == :got_nothing && response.request.options[:method] != :get
111
- config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
112
-
113
- # set to GET request since HEAD may fail in some cases
114
- response.request.options[:method] = :get
115
- hydra.queue response.request
116
- else
117
- config.process_response.call response
79
+ find_in_batches(config.links.call, config.batch_size) do |group|
80
+
81
+ group.each do |link|
82
+ # any custom pre-processing
83
+ pre_process_link(link)
84
+
85
+ if link.urls.empty?
86
+ # In the event that we have a link that actually has no urls associated with it
87
+ report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
88
+ else
89
+ # Each record has 2 urls associated with it, process each separately
90
+ link.urls.each do |url|
91
+ config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow
92
+
93
+ link_request = config.new_link_request.call(
94
+ url[:url],
95
+ followlocation: true,
96
+ method: :head,
97
+ ssl_verifypeer: false,
98
+ ssl_verifyhost: 2,
99
+ cookiefile: config.cookies_file,
100
+ cookiejar: config.cookies_file,
101
+ link_id: link.id,
102
+ url_type_code: url[:url_type_code],
103
+ timeout: config.max_timeout,
104
+ connecttimeout: config.max_timeout,
105
+ max_retries: config.max_retries,
106
+ forbid_reuse: 1,
107
+ nosignal: 1
108
+ )
109
+
110
+ link_request.on_complete do |response|
111
+ processed_links += 1
112
+
113
+ if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
114
+ config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
115
+ hydra.queue response.request
116
+ elsif response.return_code == :got_nothing && response.request.options[:method] != :get
117
+ config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
118
+
119
+ # set to GET request since HEAD may fail in some cases
120
+ response.request.options[:method] = :get
121
+ hydra.queue response.request
122
+ else
123
+ config.process_response.call response
124
+ end
118
125
  end
119
- end
120
126
 
121
- hydra.queue link_request
127
+ hydra.queue link_request
128
+ end
122
129
  end
123
130
  end
124
131
 
@@ -1,3 +1,3 @@
1
1
  module UrlProcessor
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-04-29 00:00:00.000000000 Z
12
+ date: 2014-04-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: typhoeus
@@ -186,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
186
186
  version: '0'
187
187
  segments:
188
188
  - 0
189
- hash: -3648591694633088295
189
+ hash: 1450876520014070707
190
190
  required_rubygems_version: !ruby/object:Gem::Requirement
191
191
  none: false
192
192
  requirements:
@@ -195,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
195
195
  version: '0'
196
196
  segments:
197
197
  - 0
198
- hash: -3648591694633088295
198
+ hash: 1450876520014070707
199
199
  requirements: []
200
200
  rubyforge_project:
201
201
  rubygems_version: 1.8.23.2