aranha 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +3 -5
- data/app/controllers/aranha/addresses_controller.rb +1 -0
- data/app/controllers/aranha/application_controller.rb +2 -0
- data/app/helpers/aranha/application_helper.rb +2 -0
- data/app/models/aranha/address.rb +1 -0
- data/config/routes.rb +2 -0
- data/db/migrate/20171201021251_create_aranha_addresses.rb +1 -0
- data/lib/aranha.rb +6 -0
- data/lib/aranha/default_processor.rb +35 -0
- data/lib/aranha/dom_elements_traverser.rb +44 -0
- data/lib/aranha/dom_elements_traverser/conditions.rb +32 -0
- data/lib/aranha/dom_elements_traverser/cursor.rb +46 -0
- data/lib/aranha/dom_elements_traverser/data.rb +39 -0
- data/lib/aranha/engine.rb +2 -0
- data/lib/aranha/parsers/base.rb +79 -0
- data/lib/aranha/parsers/html/base.rb +28 -0
- data/lib/aranha/parsers/html/item_list.rb +24 -0
- data/lib/aranha/parsers/html/node/base.rb +30 -0
- data/lib/aranha/parsers/html/node/default.rb +93 -0
- data/lib/aranha/processor.rb +16 -14
- data/lib/aranha/version.rb +2 -1
- data/lib/tasks/aranha_tasks.rake +1 -0
- data/test/aranha_test.rb +3 -1
- data/test/dummy/Rakefile +3 -1
- data/test/dummy/app/controllers/application_controller.rb +2 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/bin/bundle +3 -1
- data/test/dummy/bin/rails +3 -1
- data/test/dummy/bin/rake +2 -0
- data/test/dummy/bin/setup +10 -8
- data/test/dummy/config.ru +2 -0
- data/test/dummy/config/application.rb +4 -2
- data/test/dummy/config/boot.rb +4 -2
- data/test/dummy/config/environment.rb +3 -1
- data/test/dummy/config/environments/development.rb +2 -0
- data/test/dummy/config/environments/production.rb +4 -1
- data/test/dummy/config/environments/test.rb +2 -0
- data/test/dummy/config/initializers/assets.rb +2 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +6 -2
- data/test/dummy/config/initializers/cookies_serializer.rb +2 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +2 -0
- data/test/dummy/config/initializers/inflections.rb +2 -0
- data/test/dummy/config/initializers/mime_types.rb +2 -0
- data/test/dummy/config/initializers/session_store.rb +2 -0
- data/test/dummy/config/initializers/to_time_preserves_timezone.rb +2 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +2 -0
- data/test/dummy/config/routes.rb +3 -2
- data/test/integration/navigation_test.rb +2 -0
- data/test/test_helper.rb +4 -3
- metadata +55 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3050cda5e754315e7b80518851f3ce91aa167e8ca8359084203ed61b20c7aa7
|
4
|
+
data.tar.gz: 80525648edf9ba10f99f7ab2bd684a0916ce91ea20892550b8bb6f19d092b7d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b2036fa6ebc24937ac23fda40a8aacbf0a89c3f233d17ddf82c15ecdf1c26bbe36b39b04eeb2f4fa8bfcaa0170477a46338bb74f8ba356d84c591e59944c17c
|
7
|
+
data.tar.gz: 919372af358638d177b31f61dfe65b41a7d7cb81cb50d235b818e63c8de306d102fc0508143dfa6b4da6fffa23ad02d1889f4e364ffc3f5ddb56082552a629a1
|
data/Rakefile
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
begin
|
2
4
|
require 'bundler/setup'
|
3
5
|
rescue LoadError
|
@@ -14,14 +16,11 @@ RDoc::Task.new(:rdoc) do |rdoc|
|
|
14
16
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
15
17
|
end
|
16
18
|
|
17
|
-
APP_RAKEFILE = File.expand_path(
|
19
|
+
APP_RAKEFILE = File.expand_path('test/dummy/Rakefile', __dir__)
|
18
20
|
load 'rails/tasks/engine.rake'
|
19
21
|
|
20
|
-
|
21
22
|
load 'rails/tasks/statistics.rake'
|
22
23
|
|
23
|
-
|
24
|
-
|
25
24
|
Bundler::GemHelper.install_tasks
|
26
25
|
|
27
26
|
require 'rake/testtask'
|
@@ -33,5 +32,4 @@ Rake::TestTask.new(:test) do |t|
|
|
33
32
|
t.verbose = false
|
34
33
|
end
|
35
34
|
|
36
|
-
|
37
35
|
task default: :test
|
data/config/routes.rb
CHANGED
data/lib/aranha.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'httpclient'
|
3
4
|
require 'active_support/dependencies'
|
4
5
|
require_dependency 'aranha/engine'
|
@@ -7,4 +8,9 @@ require_dependency 'active_scaffold'
|
|
7
8
|
module Aranha
|
8
9
|
end
|
9
10
|
|
11
|
+
require_dependency 'aranha/default_processor'
|
10
12
|
require_dependency 'aranha/processor'
|
13
|
+
require_dependency 'aranha/parsers/base'
|
14
|
+
require_dependency 'aranha/parsers/html/base'
|
15
|
+
require_dependency 'aranha/parsers/html/item_list'
|
16
|
+
require_dependency 'aranha/dom_elements_traverser'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  # Base class for processors: keeps the source URI and lazily fetches the
  # parsed data from the parser class whose name mirrors the processor's name.
  class DefaultProcessor
    # The URI given at construction, after Addressable::URI.parse.
    attr_reader :source_uri

    # @param source_uri [Addressable::URI, #to_s] a value that is not already
    #   an Addressable::URI is converted with +to_s+ and a leading "/" is
    #   rewritten to "file:///" before parsing.
    def initialize(source_uri)
      raw = source_uri
      raw = raw.to_s.gsub(%r{\A/}, 'file:///') unless raw.is_a?(Addressable::URI)
      @source_uri = Addressable::URI.parse(raw)
    end

    # Placeholder meant to be overridden; this implementation always raises.
    def process
      raise 'Implement method process'
    end

    protected

    # URI handed to the parser. Here it is simply #source_uri.
    def target_uri
      source_uri
    end

    # Memoized result of +data+ on a parser instance built for #target_uri.
    def data
      @data ||= parser_class.new(target_uri).data
    end

    # Resolves the parser constant by replacing "::Processors::" with
    # "::Parsers::" in this class's name.
    #
    # @raise [RuntimeError] when this object is an instance of the resolved
    #   class (for example when the name contains no "::Processors::" segment
    #   and therefore resolves back to the processor itself).
    def parser_class
      parser = self.class.name.gsub('::Processors::', '::Parsers::').constantize
      raise "Parser can be not the process class: #{parser}" if is_a?(parser)

      parser
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_dependency 'aranha/dom_elements_traverser/conditions'
|
4
|
+
require_dependency 'aranha/dom_elements_traverser/data'
|
5
|
+
require_dependency 'aranha/dom_elements_traverser/cursor'
|
6
|
+
|
7
|
+
module Aranha
  # Walks a flat list of DOM elements with a cursor (@index), collecting
  # values into @data. The cursor, condition and storage helpers come from
  # the three included modules.
  class DomElementsTraverser
    include ::Aranha::DomElementsTraverser::Conditions
    include ::Aranha::DomElementsTraverser::Cursor
    include ::Aranha::DomElementsTraverser::Data

    class << self
      # Builds a traverser over the elements selected by +options+ and, when a
      # block is given, evaluates it in the context of the new instance.
      def traverse(options, &block)
        new(elements_from_options(options), &block)
      end

      # Traverser over an empty element list.
      def empty
        new([])
      end

      private

      # Extracts the element list from +options+. Only :children_of is
      # recognised; its value must respond to +children+.
      # NOTE(review): +validate+ presumably rejects unconsumed options -
      # confirm against EacRubyUtils::OptionsConsumer.
      def elements_from_options(options)
        consumer = ::EacRubyUtils::OptionsConsumer.new(options)
        children = nil
        consumer.consume(:children_of) { |parent| children = parent.children.to_a }
        raise 'None option of [:children_of] defined' unless children

        consumer.validate
        children
      end
    end

    private

    # @param elements [Array] elements to traverse, indexed from position 0
    # The optional block is instance_eval'ed, so it can call the private
    # cursor/storage helpers directly.
    def initialize(elements, &block)
      @elements = elements
      @index = 0
      @data = {}
      instance_eval(&block) if block
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  class DomElementsTraverser
    # Predicates evaluated against the element under the cursor. The including
    # class must provide +current+ (the element, or nil when exhausted); the
    # element must respond to +name+ and +text+.
    module Conditions
      private

      # True when the current element satisfies every condition.
      #
      # @param conditions [Hash] keys :text and/or :name (String or Symbol)
      # @raise [RuntimeError] when there is no current element or a key is unknown
      def match_conditions?(conditions)
        raise "No element (Conditions: #{conditions})" unless current

        conditions.all? { |key, value| match_condition?(key, value) }
      end

      # Dispatches a single condition to its specific matcher.
      def match_condition?(key, value)
        case key.to_sym
        when :text then match_text_condition?(value)
        when :name then match_name_condition?(value)
        else raise "Unknown key condition: (#{key})"
        end
      end

      # Case-insensitive comparison of the element name with +tag_name+.
      def match_name_condition?(tag_name)
        current.name.casecmp(tag_name.to_s).zero?
      end

      # True when the element text contains, ignoring case, every given text.
      #
      # @param texts [Object, Array] one text or a list of texts; each entry is
      #   converted with +to_s+, so symbols work for both the scalar and the
      #   array form (array entries used to raise TypeError when not Strings).
      def match_text_condition?(texts)
        haystack = current.text.downcase # computed once, not once per needle
        needles = texts.is_a?(Array) ? texts : [texts]
        needles.all? { |needle| haystack.include?(needle.to_s.downcase) }
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  class DomElementsTraverser
    # Cursor movement over @elements using @index. Relies on
    # +match_conditions?+ being available in the including class.
    module Cursor
      private

      # Element under the cursor, or nil once the index is past the end.
      def current
        @elements[@index]
      end

      # Advances the cursor by one position.
      def skip
        @index += 1
      end

      # Advances until the current element matches the conditions left in
      # +options+ after :optional (default false) is taken out.
      #
      # @return the matching element, or nil when none matched and
      #   :optional was truthy
      # @raise [RuntimeError] when nothing matched and :optional was falsy
      def skip_until(options)
        consumer = ::EacRubyUtils::OptionsConsumer.new(options)
        optional = consumer.consume(:optional, false)
        skip while current && !match_conditions?(consumer.left_data)
        raise "No element found for conditions #{consumer.left_data}" unless current || optional

        current
      end

      # Like #skip_until, then moves one position past the match and returns
      # the element found there (possibly nil).
      def skip_until_after(conditions)
        skip_until(conditions)
        skip
        current
      end

      # Searches forward for +conditions+. On a match the optional block is
      # instance_eval'ed with the cursor on the match; otherwise the cursor
      # is put back where it started.
      def if_found(conditions, &block)
        bookmark = @index
        skip_until({ optional: true }.merge(conditions))
        return (@index = bookmark) unless current

        instance_eval(&block) if block
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  class DomElementsTraverser
    # Storage of values read from the element under the cursor. The including
    # class must provide +current+, +skip+ and +match_conditions?+, and keep
    # the collected values in the @data hash.
    module Data
      # Shallow copy of the values collected so far.
      def data
        @data.dup
      end

      private

      # Reads a value from the current element, stores it under +key+ and
      # advances the cursor.
      #
      # @param key [Object] key used in the collected data
      # @param options [Hash] :attribute (read that attribute instead of the
      #   stripped text) and :validate (conditions the element must match)
      # @yield [value] optional converter applied to the raw value
      # @return the element the value was read from
      # @raise [RuntimeError] when validation fails, there is no current
      #   element, or the requested attribute is absent
      def store(key, options = {}, &converter)
        validate(options)
        @data[key] = store_value(options, converter)
        stored_from = current
        skip
        stored_from
      end

      # Raw (or converted) value for the current element.
      def store_value(options, converter)
        # Fail with a readable message instead of NoMethodError on nil.
        raise 'No element to read a value from' unless current

        raw = options.key?(:attribute) ? attribute_value(options[:attribute]) : current.text.strip
        converter ? converter.call(raw) : raw
      end

      # Value of attribute +name+ on the current element; raises a descriptive
      # error when the element has no such attribute.
      def attribute_value(name)
        attribute = current.attribute(name)
        raise "Attribute \"#{name}\" not found (Element: |#{current}|#{current.name}|)" unless attribute

        attribute.value
      end

      # Checks the current element against options[:validate], when present.
      def validate(options)
        return unless options.key?(:validate)
        return if match_conditions?(options[:validate])

        raise "Element does not match conditions #{options[:validate]}" \
          " (Element: |#{current}|#{current.name}|)"
      end
    end
  end
end
|
data/lib/aranha/engine.rb
CHANGED
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
module Aranha
  module Parsers
    # Fetches the raw content behind a "URL", which may be an HTTP(S) address,
    # a local file path / file:// address, or a Hash describing a POST request.
    class Base
      # @param url [String, #to_s, Hash] address to read; a Hash must carry
      #   :method (only :post is handled), :url and :params
      def initialize(url)
        @url = url
      end

      # The URL without trailing slashes. Calls +gsub+ on the stored value, so
      # it is only usable when the URL was given as a String.
      def url
        @url.gsub(%r{/+$}, '')
      end

      # Fetches the content and writes a copy to the parser log file.
      #
      # @return [String] the fetched content
      def content
        fetched = content_by_url_type
        log_content(fetched)
        fetched
      end

      private

      # Chooses the fetch strategy from the shape of the URL.
      def content_by_url_type
        return content_hash if @url.is_a?(Hash)
        # \A (not ^) so only a URL that *starts* with "http" is fetched over
        # HTTP; to_s lets URI objects (e.g. Addressable::URI) be used too.
        return content_get if /\Ahttp/ =~ @url.to_s

        content_file
      end

      # Reads a local file, accepting both plain paths and file:// addresses.
      def content_file
        ::File.open(@url.to_s.sub(%r{\Afile://}, ''), &:read)
      end

      def content_get
        content_get_fetch(@url)
      end

      # GETs +uri+, following up to +limit+ redirects.
      #
      # @raise [RuntimeError] when the redirect limit is exhausted
      # @raise the error produced by Net::HTTPResponse#value for any response
      #   that is neither a success nor a redirection
      def content_get_fetch(uri, limit = 10)
        raise 'too many HTTP redirects' if limit.zero?

        response = Net::HTTP.get_response(URI(uri))

        case response
        when Net::HTTPSuccess
          response.body
        when Net::HTTPRedirection
          content_get_fetch(redirect_target(uri, response['location']), limit - 1)
        else
          response.value
        end
      end

      # Absolute address for a redirect: a relative Location header is resolved
      # against the address that produced it.
      def redirect_target(base, location)
        raise "Redirect without Location header: #{base}" if location.to_s.empty?

        URI.join(base.to_s, location).to_s
      end

      # Handles Hash URLs; only { method: :post } is supported.
      def content_hash
        return content_post if @url[:method] == :post

        raise "Unknown URL format: #{@url}"
      end

      # NOTE(review): follow_redirect is merged into the posted params hash;
      # confirm HTTPClient#post_content treats it as an option and not as a
      # form field.
      def content_post
        HTTPClient.new.post_content(@url[:url], @url[:params].merge(follow_redirect: true))
      end

      def log_content(content)
        File.open(log_file, 'wb') { |file| file.write(content) }
      end

      # Path of the log file for this parser class (under Rails.root/log/parsers);
      # the directory is created when missing.
      def log_file
        f = Rails.root.join('log', 'parsers', "#{self.class.name.parameterize}.log")
        FileUtils.mkdir_p(File.dirname(f))
        f
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_dependency 'aranha/parsers/base'
|
4
|
+
require_dependency 'aranha/parsers/html/node/default'
|
5
|
+
|
6
|
+
module Aranha
  module Parsers
    module Html
      # HTML flavour of the base parser: exposes the fetched content as a
      # Nokogiri document and builds the node parser used by subclasses.
      class Base < ::Aranha::Parsers::Base
        # Memoized Nokogiri document for #content, parsed with the
        # +noblanks+ option enabled.
        def nokogiri
          @nokogiri ||= Nokogiri::HTML(content) { |config| config.noblanks }
        end

        protected

        # Class used to turn a node into a hash of field values.
        def node_parser_class
          ::Aranha::Parsers::Html::Node::Default
        end

        private

        # Memoized node parser built from +fields+, which is not defined in
        # this class and so must be supplied by the concrete parser.
        def node_parser
          @node_parser ||= node_parser_class.new(fields)
        end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  module Parsers
    module Html
      # Parser for pages made of a repeated item: every node selected by
      # #items_xpath is converted to a hash by the node parser.
      class ItemList < Base
        # Memoized list with one parsed hash per item node.
        #
        # @raise [StandardError] the original error class, with the number of
        #   items visited so far appended to its message
        def data
          count = 0
          @data ||= nokogiri.xpath(items_xpath).map do |m|
            count += 1
            node_parser.parse(m)
          end
        rescue StandardError => e
          # Exception#exception builds a copy with the new message. Appending
          # to e.message in place raises FrozenError when the message is a
          # frozen string literal.
          raise e.exception("#{e.message} / Count: #{count}")
        end

        # XPath selecting the item nodes; concrete parsers must override it.
        def items_xpath
          raise "Class #{self.class} has no method \"items_xpath\". Implement it"
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  module Parsers
    module Html
      module Node
        # Turns a node into a hash following a list of field definitions.
        # Each definition is indexable as [key, parser_method, xpath]; the
        # value is produced by the public method "<parser_method>_value",
        # which subclasses provide.
        class Base
          # Field definitions given at construction.
          attr_reader :fields

          # @param fields [Array] list of [key, parser_method, xpath] entries
          def initialize(fields)
            @fields = fields
          end

          # @param node [Object] node handed to every "<parser_method>_value"
          # @return [Hash] one entry per field definition, keyed by its key
          # @raise [RuntimeError] when a value method is not publicly available
          def parse(node)
            fields.map { |field| [field[0], parse_field(node, field[2], field[1])] }.to_h
          end

          private

          # Calls "<parser_method>_value" with the node and xpath.
          def parse_field(node, xpath, parser_method)
            value_method = "#{parser_method}_value"
            # respond_to? only sees public methods, hence public_send.
            return public_send(value_method, node, xpath) if respond_to?(value_method)

            raise "Method \"#{value_method}\" not found in #{self.class}"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_dependency 'aranha/parsers/html/node/base'
|
4
|
+
|
5
|
+
module Aranha
  module Parsers
    module Html
      module Node
        # Stock "<type>_value" extractors. Every method receives a node that
        # responds to +at_xpath+/+xpath+ and the XPath of the wanted value.
        class Default < ::Aranha::Parsers::Html::Node::Base
          # Text of the first match with non-breaking spaces turned into plain
          # spaces and surrounding whitespace removed; '' when nothing matches.
          def string_value(node, xpath)
            found = node.at_xpath(xpath)
            return '' unless found

            found.text.to_s.tr("\u00A0", ' ').strip
          end

          # Content of the first double-quoted segment, or '' when there is none.
          def quoted_value(node, xpath)
            m = /"([^"]+)"/.match(string_value(node, xpath))
            m ? m[1] : ''
          end

          # First run of digits as an Integer; nil when the text is blank.
          #
          # @raise [RuntimeError] when the text is not blank and has no digits
          def integer_value(node, xpath)
            r = string_value(node, xpath)
            return nil if r.blank?

            m = /\d+/.match(r)
            raise "Integer not found in \"#{r}\"" unless m

            m[0].to_i
          end

          # First run of digits as an Integer, or nil when there is none.
          def integer_optional_value(node, xpath)
            m = /\d+/.match(string_value(node, xpath))
            m ? m[0].to_i : nil
          end

          # First decimal number as a Float; raises when none is found.
          def float_value(node, xpath)
            parse_float(node, xpath, true)
          end

          # First decimal number as a Float, or nil when none is found.
          def float_optional_value(node, xpath)
            parse_float(node, xpath, false)
          end

          # Stripped texts of all matches joined with "|".
          def array_value(node, xpath)
            node.xpath(xpath).map { |n| n.text.strip }.join('|')
          end

          # Stripped texts of all matches concatenated. Built with map/join:
          # appending to a '' literal raises FrozenError in a file that
          # enables frozen string literals.
          def join_value(node, xpath)
            node.xpath(xpath).map { |n| n.text.strip }.join
          end

          # Integer that precedes " m" in the joined text, or nil.
          def duration_value(node, xpath)
            m = /(\d+) m/.match(join_value(node, xpath))
            m ? m[1].to_i : nil
          end

          # MatchData of +pattern+ against the string value. The method name
          # (sic) is kept as-is so existing callers keep working.
          #
          # @raise [RuntimeError] when the pattern does not match
          def regxep(node, xpath, pattern)
            s = string_value(node, xpath)
            m = pattern.match(s)
            return m if m

            raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
          end

          private

          # Shared float extraction: accepts "." or "," as decimal separator.
          # Returns nil when nothing is found and +required+ is false.
          def parse_float(node, xpath, required)
            s = string_value(node, xpath)
            m = /\d+(?:[.,]\d+)?/.match(s)
            return m[0].tr(',', '.').to_f if m

            raise "Float value not found in \"#{s}\"" if required
          end
        end
      end
    end
  end
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'net/http'
|
3
4
|
|
4
5
|
module Aranha
|
5
6
|
class Processor
|
6
7
|
NETWORK_EXCEPTIONS = [::HTTPClient::BadResponseError, Errno::ECONNRESET,
|
7
|
-
::Net::HTTPFatalError].freeze
|
8
|
+
::Net::HTTPFatalError, ::HTTPClient::ReceiveTimeoutError].freeze
|
8
9
|
DEFAULT_MAX_TRIES = 3
|
9
10
|
|
10
11
|
def initialize
|
@@ -32,28 +33,29 @@ module Aranha
|
|
32
33
|
false
|
33
34
|
elsif @failed.any?
|
34
35
|
@try += 1
|
35
|
-
max_tries
|
36
|
+
max_tries.positive? && @try >= max_tries
|
36
37
|
else
|
37
38
|
true
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
def process_address(
|
42
|
-
Rails.logger.info("Processing #{
|
42
|
+
def process_address(address)
|
43
|
+
Rails.logger.info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
43
44
|
" Unprocessed: #{unprocessed.count}/#{Aranha::Address.count})")
|
44
45
|
begin
|
45
|
-
|
46
|
-
@failed.delete(
|
46
|
+
address.process
|
47
|
+
@failed.delete(address.id)
|
47
48
|
rescue StandardError => ex
|
48
|
-
process_exception(
|
49
|
+
process_exception(address, ex)
|
49
50
|
end
|
50
51
|
end
|
51
52
|
|
52
|
-
def process_exception(
|
53
|
-
raise
|
54
|
-
|
55
|
-
@failed[
|
56
|
-
|
53
|
+
def process_exception(address, exception)
|
54
|
+
raise exception unless network_exception?(exception)
|
55
|
+
|
56
|
+
@failed[address.id] ||= 0
|
57
|
+
@failed[address.id] += 1
|
58
|
+
Rails.logger.warn(exception)
|
57
59
|
end
|
58
60
|
|
59
61
|
def next_address
|
@@ -64,8 +66,8 @@ module Aranha
|
|
64
66
|
::Aranha::Address.unprocessed
|
65
67
|
end
|
66
68
|
|
67
|
-
def network_exception?(
|
68
|
-
NETWORK_EXCEPTIONS.any? { |klass|
|
69
|
+
def network_exception?(exception)
|
70
|
+
NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
69
71
|
end
|
70
72
|
|
71
73
|
def not_try_ids
|
data/lib/aranha/version.rb
CHANGED
data/lib/tasks/aranha_tasks.rake
CHANGED
data/test/aranha_test.rb
CHANGED
data/test/dummy/Rakefile
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Add your own tasks in files placed in lib/tasks ending in .rake,
|
2
4
|
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.
|
3
5
|
|
4
|
-
require File.expand_path('
|
6
|
+
require File.expand_path('config/application', __dir__)
|
5
7
|
|
6
8
|
Rails.application.load_tasks
|
data/test/dummy/bin/bundle
CHANGED
data/test/dummy/bin/rails
CHANGED
data/test/dummy/bin/rake
CHANGED
data/test/dummy/bin/setup
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require 'pathname'
|
3
5
|
|
4
6
|
# path to your application root.
|
5
|
-
APP_ROOT = Pathname.new File.expand_path('
|
7
|
+
APP_ROOT = Pathname.new File.expand_path('..', __dir__)
|
6
8
|
|
7
9
|
Dir.chdir APP_ROOT do
|
8
10
|
# This script is a starting point to setup your application.
|
9
11
|
# Add necessary setup steps to this file:
|
10
12
|
|
11
|
-
puts
|
12
|
-
system
|
13
|
-
system
|
13
|
+
puts '== Installing dependencies =='
|
14
|
+
system 'gem install bundler --conservative'
|
15
|
+
system 'bundle check || bundle install'
|
14
16
|
|
15
17
|
# puts "\n== Copying sample files =="
|
16
18
|
# unless File.exist?("config/database.yml")
|
@@ -18,12 +20,12 @@ Dir.chdir APP_ROOT do
|
|
18
20
|
# end
|
19
21
|
|
20
22
|
puts "\n== Preparing database =="
|
21
|
-
system
|
23
|
+
system 'bin/rake db:setup'
|
22
24
|
|
23
25
|
puts "\n== Removing old logs and tempfiles =="
|
24
|
-
system
|
25
|
-
system
|
26
|
+
system 'rm -f log/*'
|
27
|
+
system 'rm -rf tmp/cache'
|
26
28
|
|
27
29
|
puts "\n== Restarting application server =="
|
28
|
-
system
|
30
|
+
system 'touch tmp/restart.txt'
|
29
31
|
end
|
data/test/dummy/config.ru
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require File.expand_path('boot', __dir__)
|
2
4
|
|
3
5
|
require 'rails/all'
|
4
6
|
|
5
7
|
Bundler.require(*Rails.groups)
|
6
|
-
require
|
8
|
+
require 'aranha'
|
7
9
|
|
8
10
|
module Dummy
|
9
11
|
class Application < Rails::Application
|
data/test/dummy/config/boot.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Set up gems listed in the Gemfile.
|
2
|
-
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('
|
4
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../../Gemfile', __dir__)
|
3
5
|
|
4
6
|
require 'bundler/setup' if File.exist?(ENV['BUNDLE_GEMFILE'])
|
5
|
-
$LOAD_PATH.unshift File.expand_path('
|
7
|
+
$LOAD_PATH.unshift File.expand_path('../../../lib', __dir__)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
Rails.application.configure do
|
2
4
|
# Settings specified here will take precedence over those in config/application.rb.
|
3
5
|
|
@@ -35,7 +37,8 @@ Rails.application.configure do
|
|
35
37
|
# yet still be able to expire them through the digest params.
|
36
38
|
config.assets.digest = true
|
37
39
|
|
38
|
-
# `config.assets.precompile` and `config.assets.version` have moved to
|
40
|
+
# `config.assets.precompile` and `config.assets.version` have moved to
|
41
|
+
# config/initializers/assets.rb
|
39
42
|
|
40
43
|
# Specifies the header that your server uses for sending files.
|
41
44
|
# config.action_dispatch.x_sendfile_header = 'X-Sendfile' # for Apache
|
@@ -1,7 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Be sure to restart your server when you modify this file.
|
2
4
|
|
3
|
-
# You can add backtrace silencers for libraries that you're using but don't wish to see
|
5
|
+
# You can add backtrace silencers for libraries that you're using but don't wish to see
|
6
|
+
# in your backtraces.
|
4
7
|
# Rails.backtrace_cleaner.add_silencer { |line| line =~ /my_noisy_library/ }
|
5
8
|
|
6
|
-
# You can also remove all the silencers if you're trying to debug a problem that might
|
9
|
+
# You can also remove all the silencers if you're trying to debug a problem that might
|
10
|
+
# stem from framework code.
|
7
11
|
# Rails.backtrace_cleaner.remove_silencers!
|
data/test/dummy/config/routes.rb
CHANGED
data/test/test_helper.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
# Configure Rails Environment
|
3
4
|
ENV['RAILS_ENV'] = 'test'
|
4
5
|
|
5
|
-
require File.expand_path('
|
6
|
+
require File.expand_path('../test/dummy/config/environment.rb', __dir__)
|
6
7
|
ActiveRecord::Migrator.migrations_paths = [
|
7
|
-
File.expand_path('
|
8
|
+
File.expand_path('../test/dummy/db/migrate', __dir__)
|
8
9
|
]
|
9
10
|
require 'rails/test_help'
|
10
11
|
|
@@ -17,6 +18,6 @@ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each { |f| require f }
|
|
17
18
|
|
18
19
|
# Load fixtures from the engine
|
19
20
|
if ActiveSupport::TestCase.respond_to?(:fixture_path=)
|
20
|
-
ActiveSupport::TestCase.fixture_path = File.expand_path('
|
21
|
+
ActiveSupport::TestCase.fixture_path = File.expand_path('fixtures', __dir__)
|
21
22
|
ActiveSupport::TestCase.fixtures :all
|
22
23
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_scaffold
|
@@ -25,19 +25,19 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 3.4.41.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: eac_ruby_utils
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '0.3'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '0.3'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: httpclient
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rails
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 4.2.10
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 4.2.10
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: sqlite3
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,7 +100,17 @@ files:
|
|
86
100
|
- config/routes.rb
|
87
101
|
- db/migrate/20171201021251_create_aranha_addresses.rb
|
88
102
|
- lib/aranha.rb
|
103
|
+
- lib/aranha/default_processor.rb
|
104
|
+
- lib/aranha/dom_elements_traverser.rb
|
105
|
+
- lib/aranha/dom_elements_traverser/conditions.rb
|
106
|
+
- lib/aranha/dom_elements_traverser/cursor.rb
|
107
|
+
- lib/aranha/dom_elements_traverser/data.rb
|
89
108
|
- lib/aranha/engine.rb
|
109
|
+
- lib/aranha/parsers/base.rb
|
110
|
+
- lib/aranha/parsers/html/base.rb
|
111
|
+
- lib/aranha/parsers/html/item_list.rb
|
112
|
+
- lib/aranha/parsers/html/node/base.rb
|
113
|
+
- lib/aranha/parsers/html/node/default.rb
|
90
114
|
- lib/aranha/processor.rb
|
91
115
|
- lib/aranha/version.rb
|
92
116
|
- lib/tasks/aranha_tasks.rake
|
@@ -156,40 +180,40 @@ summary: Rails utilities for web crawling.
|
|
156
180
|
test_files:
|
157
181
|
- test/dummy/Rakefile
|
158
182
|
- test/dummy/README.rdoc
|
159
|
-
- test/dummy/
|
160
|
-
- test/dummy/bin/rake
|
161
|
-
- test/dummy/bin/bundle
|
162
|
-
- test/dummy/bin/setup
|
163
|
-
- test/dummy/config/initializers/assets.rb
|
164
|
-
- test/dummy/config/initializers/to_time_preserves_timezone.rb
|
165
|
-
- test/dummy/config/initializers/wrap_parameters.rb
|
166
|
-
- test/dummy/config/initializers/session_store.rb
|
167
|
-
- test/dummy/config/initializers/cookies_serializer.rb
|
168
|
-
- test/dummy/config/initializers/inflections.rb
|
169
|
-
- test/dummy/config/initializers/mime_types.rb
|
170
|
-
- test/dummy/config/initializers/backtrace_silencers.rb
|
171
|
-
- test/dummy/config/initializers/filter_parameter_logging.rb
|
183
|
+
- test/dummy/config.ru
|
172
184
|
- test/dummy/config/boot.rb
|
173
|
-
- test/dummy/config/locales/en.yml
|
174
|
-
- test/dummy/config/secrets.yml
|
175
|
-
- test/dummy/config/environment.rb
|
176
185
|
- test/dummy/config/database.yml
|
177
|
-
- test/dummy/config/
|
186
|
+
- test/dummy/config/secrets.yml
|
187
|
+
- test/dummy/config/locales/en.yml
|
188
|
+
- test/dummy/config/application.rb
|
178
189
|
- test/dummy/config/environments/development.rb
|
179
190
|
- test/dummy/config/environments/test.rb
|
180
191
|
- test/dummy/config/environments/production.rb
|
181
|
-
- test/dummy/config/
|
182
|
-
- test/dummy/
|
183
|
-
- test/dummy/
|
192
|
+
- test/dummy/config/environment.rb
|
193
|
+
- test/dummy/config/routes.rb
|
194
|
+
- test/dummy/config/initializers/assets.rb
|
195
|
+
- test/dummy/config/initializers/cookies_serializer.rb
|
196
|
+
- test/dummy/config/initializers/inflections.rb
|
197
|
+
- test/dummy/config/initializers/session_store.rb
|
198
|
+
- test/dummy/config/initializers/wrap_parameters.rb
|
199
|
+
- test/dummy/config/initializers/to_time_preserves_timezone.rb
|
200
|
+
- test/dummy/config/initializers/filter_parameter_logging.rb
|
201
|
+
- test/dummy/config/initializers/backtrace_silencers.rb
|
202
|
+
- test/dummy/config/initializers/mime_types.rb
|
203
|
+
- test/dummy/db/schema.rb
|
184
204
|
- test/dummy/app/views/layouts/application.html.erb
|
185
|
-
- test/dummy/app/helpers/application_helper.rb
|
186
205
|
- test/dummy/app/controllers/application_controller.rb
|
187
|
-
- test/dummy/
|
188
|
-
- test/dummy/
|
189
|
-
- test/dummy/
|
206
|
+
- test/dummy/app/helpers/application_helper.rb
|
207
|
+
- test/dummy/app/assets/stylesheets/application.css
|
208
|
+
- test/dummy/app/assets/javascripts/application.js
|
190
209
|
- test/dummy/public/422.html
|
191
210
|
- test/dummy/public/404.html
|
192
|
-
- test/dummy/
|
193
|
-
- test/
|
211
|
+
- test/dummy/public/favicon.ico
|
212
|
+
- test/dummy/public/500.html
|
213
|
+
- test/dummy/bin/bundle
|
214
|
+
- test/dummy/bin/setup
|
215
|
+
- test/dummy/bin/rails
|
216
|
+
- test/dummy/bin/rake
|
194
217
|
- test/aranha_test.rb
|
218
|
+
- test/test_helper.rb
|
195
219
|
- test/integration/navigation_test.rb
|