aranha 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -5
- data/app/controllers/aranha/addresses_controller.rb +1 -0
- data/app/controllers/aranha/application_controller.rb +2 -0
- data/app/helpers/aranha/application_helper.rb +2 -0
- data/app/models/aranha/address.rb +1 -0
- data/config/routes.rb +2 -0
- data/db/migrate/20171201021251_create_aranha_addresses.rb +1 -0
- data/lib/aranha.rb +6 -0
- data/lib/aranha/default_processor.rb +35 -0
- data/lib/aranha/dom_elements_traverser.rb +44 -0
- data/lib/aranha/dom_elements_traverser/conditions.rb +32 -0
- data/lib/aranha/dom_elements_traverser/cursor.rb +46 -0
- data/lib/aranha/dom_elements_traverser/data.rb +39 -0
- data/lib/aranha/engine.rb +2 -0
- data/lib/aranha/parsers/base.rb +79 -0
- data/lib/aranha/parsers/html/base.rb +28 -0
- data/lib/aranha/parsers/html/item_list.rb +24 -0
- data/lib/aranha/parsers/html/node/base.rb +30 -0
- data/lib/aranha/parsers/html/node/default.rb +93 -0
- data/lib/aranha/processor.rb +16 -14
- data/lib/aranha/version.rb +2 -1
- data/lib/tasks/aranha_tasks.rake +1 -0
- data/test/aranha_test.rb +3 -1
- data/test/dummy/Rakefile +3 -1
- data/test/dummy/app/controllers/application_controller.rb +2 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/bin/bundle +3 -1
- data/test/dummy/bin/rails +3 -1
- data/test/dummy/bin/rake +2 -0
- data/test/dummy/bin/setup +10 -8
- data/test/dummy/config.ru +2 -0
- data/test/dummy/config/application.rb +4 -2
- data/test/dummy/config/boot.rb +4 -2
- data/test/dummy/config/environment.rb +3 -1
- data/test/dummy/config/environments/development.rb +2 -0
- data/test/dummy/config/environments/production.rb +4 -1
- data/test/dummy/config/environments/test.rb +2 -0
- data/test/dummy/config/initializers/assets.rb +2 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +6 -2
- data/test/dummy/config/initializers/cookies_serializer.rb +2 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +2 -0
- data/test/dummy/config/initializers/inflections.rb +2 -0
- data/test/dummy/config/initializers/mime_types.rb +2 -0
- data/test/dummy/config/initializers/session_store.rb +2 -0
- data/test/dummy/config/initializers/to_time_preserves_timezone.rb +2 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +2 -0
- data/test/dummy/config/routes.rb +3 -2
- data/test/integration/navigation_test.rb +2 -0
- data/test/test_helper.rb +4 -3
- metadata +55 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3050cda5e754315e7b80518851f3ce91aa167e8ca8359084203ed61b20c7aa7
|
4
|
+
data.tar.gz: 80525648edf9ba10f99f7ab2bd684a0916ce91ea20892550b8bb6f19d092b7d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b2036fa6ebc24937ac23fda40a8aacbf0a89c3f233d17ddf82c15ecdf1c26bbe36b39b04eeb2f4fa8bfcaa0170477a46338bb74f8ba356d84c591e59944c17c
|
7
|
+
data.tar.gz: 919372af358638d177b31f61dfe65b41a7d7cb81cb50d235b818e63c8de306d102fc0508143dfa6b4da6fffa23ad02d1889f4e364ffc3f5ddb56082552a629a1
|
data/Rakefile
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
begin
|
2
4
|
require 'bundler/setup'
|
3
5
|
rescue LoadError
|
@@ -14,14 +16,11 @@ RDoc::Task.new(:rdoc) do |rdoc|
|
|
14
16
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
15
17
|
end
|
16
18
|
|
17
|
-
APP_RAKEFILE = File.expand_path(
|
19
|
+
APP_RAKEFILE = File.expand_path('test/dummy/Rakefile', __dir__)
|
18
20
|
load 'rails/tasks/engine.rake'
|
19
21
|
|
20
|
-
|
21
22
|
load 'rails/tasks/statistics.rake'
|
22
23
|
|
23
|
-
|
24
|
-
|
25
24
|
Bundler::GemHelper.install_tasks
|
26
25
|
|
27
26
|
require 'rake/testtask'
|
@@ -33,5 +32,4 @@ Rake::TestTask.new(:test) do |t|
|
|
33
32
|
t.verbose = false
|
34
33
|
end
|
35
34
|
|
36
|
-
|
37
35
|
task default: :test
|
data/config/routes.rb
CHANGED
data/lib/aranha.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'httpclient'
|
3
4
|
require 'active_support/dependencies'
|
4
5
|
require_dependency 'aranha/engine'
|
@@ -7,4 +8,9 @@ require_dependency 'active_scaffold'
|
|
7
8
|
module Aranha
|
8
9
|
end
|
9
10
|
|
11
|
+
require_dependency 'aranha/default_processor'
|
10
12
|
require_dependency 'aranha/processor'
|
13
|
+
require_dependency 'aranha/parsers/base'
|
14
|
+
require_dependency 'aranha/parsers/html/base'
|
15
|
+
require_dependency 'aranha/parsers/html/item_list'
|
16
|
+
require_dependency 'aranha/dom_elements_traverser'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
class DefaultProcessor
|
5
|
+
attr_reader :source_uri
|
6
|
+
|
7
|
+
def initialize(source_uri)
|
8
|
+
unless source_uri.is_a?(Addressable::URI)
|
9
|
+
source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
|
10
|
+
end
|
11
|
+
@source_uri = Addressable::URI.parse(source_uri)
|
12
|
+
end
|
13
|
+
|
14
|
+
def process
|
15
|
+
raise 'Implement method process'
|
16
|
+
end
|
17
|
+
|
18
|
+
protected
|
19
|
+
|
20
|
+
def target_uri
|
21
|
+
source_uri
|
22
|
+
end
|
23
|
+
|
24
|
+
def data
|
25
|
+
@data ||= parser_class.new(target_uri).data
|
26
|
+
end
|
27
|
+
|
28
|
+
def parser_class
|
29
|
+
r = self.class.name.gsub('::Processors::', '::Parsers::').constantize
|
30
|
+
return r unless is_a?(r)
|
31
|
+
|
32
|
+
raise "Parser can be not the process class: #{r}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_dependency 'aranha/dom_elements_traverser/conditions'
|
4
|
+
require_dependency 'aranha/dom_elements_traverser/data'
|
5
|
+
require_dependency 'aranha/dom_elements_traverser/cursor'
|
6
|
+
|
7
|
+
module Aranha
|
8
|
+
class DomElementsTraverser
|
9
|
+
include ::Aranha::DomElementsTraverser::Conditions
|
10
|
+
include ::Aranha::DomElementsTraverser::Cursor
|
11
|
+
include ::Aranha::DomElementsTraverser::Data
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def traverse(options, &block)
|
15
|
+
new(elements_from_options(options), &block)
|
16
|
+
end
|
17
|
+
|
18
|
+
def empty
|
19
|
+
new([])
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def elements_from_options(options)
|
25
|
+
options = ::EacRubyUtils::OptionsConsumer.new(options)
|
26
|
+
elements = nil
|
27
|
+
options.consume(:children_of) { |v| elements = v.children.to_a }
|
28
|
+
raise 'None option of [:children_of] defined' unless elements
|
29
|
+
|
30
|
+
options.validate
|
31
|
+
elements
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def initialize(elements, &block)
|
38
|
+
@elements = elements
|
39
|
+
@index = 0
|
40
|
+
@data = {}
|
41
|
+
instance_eval(&block) if block
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
class DomElementsTraverser
|
5
|
+
module Conditions
|
6
|
+
private
|
7
|
+
|
8
|
+
def match_conditions?(conditions)
|
9
|
+
raise "No element (Conditions: #{conditions})" unless current
|
10
|
+
|
11
|
+
conditions.all? { |key, value| match_condition?(key, value) }
|
12
|
+
end
|
13
|
+
|
14
|
+
def match_condition?(key, value)
|
15
|
+
case key.to_sym
|
16
|
+
when :text then match_text_condition?(value)
|
17
|
+
when :name then match_name_condition?(value)
|
18
|
+
else raise "Unknown key condition: (#{key})"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def match_name_condition?(tag_name)
|
23
|
+
current.name.casecmp(tag_name.to_s).zero?
|
24
|
+
end
|
25
|
+
|
26
|
+
def match_text_condition?(texts)
|
27
|
+
texts = [texts.to_s] unless texts.is_a?(Array)
|
28
|
+
texts.all? { |t| current.text.downcase.include?(t.downcase) }
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
class DomElementsTraverser
|
5
|
+
module Cursor
|
6
|
+
private
|
7
|
+
|
8
|
+
def current
|
9
|
+
@elements[@index]
|
10
|
+
end
|
11
|
+
|
12
|
+
def skip
|
13
|
+
@index += 1
|
14
|
+
end
|
15
|
+
|
16
|
+
def skip_until(options)
|
17
|
+
oc = ::EacRubyUtils::OptionsConsumer.new(options)
|
18
|
+
optional = oc.consume(:optional, false)
|
19
|
+
while current
|
20
|
+
break if match_conditions?(oc.left_data)
|
21
|
+
|
22
|
+
skip
|
23
|
+
end
|
24
|
+
raise "No element found for conditions #{oc.left_data}" unless current || optional
|
25
|
+
|
26
|
+
current
|
27
|
+
end
|
28
|
+
|
29
|
+
def skip_until_after(conditions)
|
30
|
+
skip_until(conditions)
|
31
|
+
skip
|
32
|
+
current
|
33
|
+
end
|
34
|
+
|
35
|
+
def if_found(conditions, &block)
|
36
|
+
marked = @index
|
37
|
+
skip_until({ optional: true }.merge(conditions))
|
38
|
+
if current
|
39
|
+
instance_eval(&block) if block
|
40
|
+
else
|
41
|
+
@index = marked
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
class DomElementsTraverser
|
5
|
+
module Data
|
6
|
+
def data
|
7
|
+
@data.dup
|
8
|
+
end
|
9
|
+
|
10
|
+
private
|
11
|
+
|
12
|
+
def store(key, options = {}, &converter)
|
13
|
+
validate(options)
|
14
|
+
value = store_value(options, converter)
|
15
|
+
@data[key] = value
|
16
|
+
r = current
|
17
|
+
skip
|
18
|
+
r
|
19
|
+
end
|
20
|
+
|
21
|
+
def store_value(options, converter)
|
22
|
+
value = if options.key?(:attribute)
|
23
|
+
current.attribute(options[:attribute]).value
|
24
|
+
else
|
25
|
+
current.text.strip
|
26
|
+
end
|
27
|
+
converter ? converter.call(value) : value
|
28
|
+
end
|
29
|
+
|
30
|
+
def validate(options)
|
31
|
+
return unless options.key?(:validate)
|
32
|
+
return if match_conditions?(options[:validate])
|
33
|
+
|
34
|
+
raise "Element does not match conditions #{options[:validate]}" \
|
35
|
+
" (Element: |#{current}|#{current.name}|)"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/aranha/engine.rb
CHANGED
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
class Base
|
9
|
+
def initialize(url)
|
10
|
+
@url = url
|
11
|
+
end
|
12
|
+
|
13
|
+
def url
|
14
|
+
@url.gsub(%r{/+$}, '')
|
15
|
+
end
|
16
|
+
|
17
|
+
def content
|
18
|
+
s = content_by_url_type
|
19
|
+
log_content(s)
|
20
|
+
s
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def content_by_url_type
|
26
|
+
if @url.is_a?(Hash)
|
27
|
+
content_hash
|
28
|
+
elsif /^http/ =~ @url
|
29
|
+
content_get
|
30
|
+
else
|
31
|
+
content_file
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def content_file
|
36
|
+
::File.open(@url.gsub(%r{\Afile://}, ''), &:read)
|
37
|
+
end
|
38
|
+
|
39
|
+
def content_get
|
40
|
+
content_get_fetch(@url)
|
41
|
+
end
|
42
|
+
|
43
|
+
def content_get_fetch(uri, limit = 10)
|
44
|
+
raise 'too many HTTP redirects' if limit.zero?
|
45
|
+
|
46
|
+
response = Net::HTTP.get_response(URI(uri))
|
47
|
+
|
48
|
+
case response
|
49
|
+
when Net::HTTPSuccess then
|
50
|
+
response.body
|
51
|
+
when Net::HTTPRedirection then
|
52
|
+
content_get_fetch(response['location'], limit - 1)
|
53
|
+
else
|
54
|
+
response.value
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def content_hash
|
59
|
+
return content_post if @url[:method] == :post
|
60
|
+
|
61
|
+
raise "Unknown URL format: #{@url}"
|
62
|
+
end
|
63
|
+
|
64
|
+
def content_post
|
65
|
+
HTTPClient.new.post_content(@url[:url], @url[:params].merge(follow_redirect: true))
|
66
|
+
end
|
67
|
+
|
68
|
+
def log_content(content)
|
69
|
+
File.open(log_file, 'wb') { |file| file.write(content) }
|
70
|
+
end
|
71
|
+
|
72
|
+
def log_file
|
73
|
+
f = Rails.root.join('log', 'parsers', "#{self.class.name.parameterize}.log")
|
74
|
+
FileUtils.mkdir_p(File.dirname(f))
|
75
|
+
f
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_dependency 'aranha/parsers/base'
|
4
|
+
require_dependency 'aranha/parsers/html/node/default'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Html
|
9
|
+
class Base < ::Aranha::Parsers::Base
|
10
|
+
def nokogiri
|
11
|
+
@nokogiri ||= Nokogiri::HTML(content, &:noblanks)
|
12
|
+
end
|
13
|
+
|
14
|
+
protected
|
15
|
+
|
16
|
+
def node_parser_class
|
17
|
+
::Aranha::Parsers::Html::Node::Default
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
def node_parser
|
23
|
+
@node_parser ||= node_parser_class.new(fields)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Parsers
|
5
|
+
module Html
|
6
|
+
class ItemList < Base
|
7
|
+
def data
|
8
|
+
count = 0
|
9
|
+
@data ||= nokogiri.xpath(items_xpath).map do |m|
|
10
|
+
count += 1
|
11
|
+
node_parser.parse(m)
|
12
|
+
end
|
13
|
+
rescue StandardError => e
|
14
|
+
e.message << " / Count: #{count}"
|
15
|
+
raise e
|
16
|
+
end
|
17
|
+
|
18
|
+
def items_xpath
|
19
|
+
raise "Class #{self.class} has no method \"item_xpath\". Implement it"
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Parsers
|
5
|
+
module Html
|
6
|
+
module Node
|
7
|
+
class Base
|
8
|
+
attr_reader :fields
|
9
|
+
|
10
|
+
def initialize(fields)
|
11
|
+
@fields = fields
|
12
|
+
end
|
13
|
+
|
14
|
+
def parse(node)
|
15
|
+
Hash[fields.map { |f| [f[0], parse_field(node, f[2], f[1])] }]
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def parse_field(node, xpath, parser_method)
|
21
|
+
value_method = "#{parser_method}_value"
|
22
|
+
return send(value_method, node, xpath) if respond_to?(value_method)
|
23
|
+
|
24
|
+
raise "Method \"#{value_method}\" not found in #{self.class}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_dependency 'aranha/parsers/html/node/base'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Html
|
8
|
+
module Node
|
9
|
+
class Default < ::Aranha::Parsers::Html::Node::Base
|
10
|
+
def string_value(node, xpath)
|
11
|
+
if node.at_xpath(xpath)
|
12
|
+
node.at_xpath(xpath).text.to_s.tr("\u00A0", ' ').strip
|
13
|
+
else
|
14
|
+
''
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def quoted_value(node, xpath)
|
19
|
+
s = string_value(node, xpath)
|
20
|
+
return '' unless s
|
21
|
+
|
22
|
+
m = /\"([^\"]+)\"/.match(s)
|
23
|
+
return m[1] if m
|
24
|
+
|
25
|
+
''
|
26
|
+
end
|
27
|
+
|
28
|
+
def integer_value(node, xpath)
|
29
|
+
r = string_value(node, xpath)
|
30
|
+
return nil if r.blank?
|
31
|
+
|
32
|
+
m = /\d+/.match(r)
|
33
|
+
raise "Integer not found in \"#{r}\"" unless m
|
34
|
+
|
35
|
+
m[0].to_i
|
36
|
+
end
|
37
|
+
|
38
|
+
def integer_optional_value(node, xpath)
|
39
|
+
r = string_value(node, xpath)
|
40
|
+
m = /\d+/.match(r)
|
41
|
+
m ? m[0].to_i : nil
|
42
|
+
end
|
43
|
+
|
44
|
+
def float_value(node, xpath)
|
45
|
+
parse_float(node, xpath, true)
|
46
|
+
end
|
47
|
+
|
48
|
+
def float_optional_value(node, xpath)
|
49
|
+
parse_float(node, xpath, false)
|
50
|
+
end
|
51
|
+
|
52
|
+
def array_value(node, xpath)
|
53
|
+
r = node.xpath(xpath).map { |n| n.text.strip }
|
54
|
+
r.join('|')
|
55
|
+
end
|
56
|
+
|
57
|
+
def join_value(node, xpath)
|
58
|
+
m = ''
|
59
|
+
node.xpath(xpath).each do |n|
|
60
|
+
m << n.text.strip
|
61
|
+
end
|
62
|
+
m
|
63
|
+
end
|
64
|
+
|
65
|
+
def duration_value(node, xpath)
|
66
|
+
m = /(\d+) m/.match(join_value(node, xpath))
|
67
|
+
m ? m[1].to_i : nil
|
68
|
+
end
|
69
|
+
|
70
|
+
def regxep(node, xpath, pattern)
|
71
|
+
s = string_value(node, xpath)
|
72
|
+
m = pattern.match(s)
|
73
|
+
return m if m
|
74
|
+
|
75
|
+
raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
def parse_float(node, xpath, required)
|
81
|
+
s = string_value(node, xpath)
|
82
|
+
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
83
|
+
if m
|
84
|
+
m[0].sub(',', '.').to_f
|
85
|
+
elsif required
|
86
|
+
raise "Float value not found in \"#{s}\""
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'net/http'
|
3
4
|
|
4
5
|
module Aranha
|
5
6
|
class Processor
|
6
7
|
NETWORK_EXCEPTIONS = [::HTTPClient::BadResponseError, Errno::ECONNRESET,
|
7
|
-
::Net::HTTPFatalError].freeze
|
8
|
+
::Net::HTTPFatalError, ::HTTPClient::ReceiveTimeoutError].freeze
|
8
9
|
DEFAULT_MAX_TRIES = 3
|
9
10
|
|
10
11
|
def initialize
|
@@ -32,28 +33,29 @@ module Aranha
|
|
32
33
|
false
|
33
34
|
elsif @failed.any?
|
34
35
|
@try += 1
|
35
|
-
max_tries
|
36
|
+
max_tries.positive? && @try >= max_tries
|
36
37
|
else
|
37
38
|
true
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
def process_address(
|
42
|
-
Rails.logger.info("Processing #{
|
42
|
+
def process_address(address)
|
43
|
+
Rails.logger.info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
43
44
|
" Unprocessed: #{unprocessed.count}/#{Aranha::Address.count})")
|
44
45
|
begin
|
45
|
-
|
46
|
-
@failed.delete(
|
46
|
+
address.process
|
47
|
+
@failed.delete(address.id)
|
47
48
|
rescue StandardError => ex
|
48
|
-
process_exception(
|
49
|
+
process_exception(address, ex)
|
49
50
|
end
|
50
51
|
end
|
51
52
|
|
52
|
-
def process_exception(
|
53
|
-
raise
|
54
|
-
|
55
|
-
@failed[
|
56
|
-
|
53
|
+
def process_exception(address, exception)
|
54
|
+
raise exception unless network_exception?(exception)
|
55
|
+
|
56
|
+
@failed[address.id] ||= 0
|
57
|
+
@failed[address.id] += 1
|
58
|
+
Rails.logger.warn(exception)
|
57
59
|
end
|
58
60
|
|
59
61
|
def next_address
|
@@ -64,8 +66,8 @@ module Aranha
|
|
64
66
|
::Aranha::Address.unprocessed
|
65
67
|
end
|
66
68
|
|
67
|
-
def network_exception?(
|
68
|
-
NETWORK_EXCEPTIONS.any? { |klass|
|
69
|
+
def network_exception?(exception)
|
70
|
+
NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
69
71
|
end
|
70
72
|
|
71
73
|
def not_try_ids
|
data/lib/aranha/version.rb
CHANGED
data/lib/tasks/aranha_tasks.rake
CHANGED
data/test/aranha_test.rb
CHANGED
data/test/dummy/Rakefile
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Add your own tasks in files placed in lib/tasks ending in .rake,
|
2
4
|
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.
|
3
5
|
|
4
|
-
require File.expand_path('
|
6
|
+
require File.expand_path('config/application', __dir__)
|
5
7
|
|
6
8
|
Rails.application.load_tasks
|
data/test/dummy/bin/bundle
CHANGED
data/test/dummy/bin/rails
CHANGED
data/test/dummy/bin/rake
CHANGED
data/test/dummy/bin/setup
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require 'pathname'
|
3
5
|
|
4
6
|
# path to your application root.
|
5
|
-
APP_ROOT = Pathname.new File.expand_path('
|
7
|
+
APP_ROOT = Pathname.new File.expand_path('..', __dir__)
|
6
8
|
|
7
9
|
Dir.chdir APP_ROOT do
|
8
10
|
# This script is a starting point to setup your application.
|
9
11
|
# Add necessary setup steps to this file:
|
10
12
|
|
11
|
-
puts
|
12
|
-
system
|
13
|
-
system
|
13
|
+
puts '== Installing dependencies =='
|
14
|
+
system 'gem install bundler --conservative'
|
15
|
+
system 'bundle check || bundle install'
|
14
16
|
|
15
17
|
# puts "\n== Copying sample files =="
|
16
18
|
# unless File.exist?("config/database.yml")
|
@@ -18,12 +20,12 @@ Dir.chdir APP_ROOT do
|
|
18
20
|
# end
|
19
21
|
|
20
22
|
puts "\n== Preparing database =="
|
21
|
-
system
|
23
|
+
system 'bin/rake db:setup'
|
22
24
|
|
23
25
|
puts "\n== Removing old logs and tempfiles =="
|
24
|
-
system
|
25
|
-
system
|
26
|
+
system 'rm -f log/*'
|
27
|
+
system 'rm -rf tmp/cache'
|
26
28
|
|
27
29
|
puts "\n== Restarting application server =="
|
28
|
-
system
|
30
|
+
system 'touch tmp/restart.txt'
|
29
31
|
end
|
data/test/dummy/config.ru
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require File.expand_path('boot', __dir__)
|
2
4
|
|
3
5
|
require 'rails/all'
|
4
6
|
|
5
7
|
Bundler.require(*Rails.groups)
|
6
|
-
require
|
8
|
+
require 'aranha'
|
7
9
|
|
8
10
|
module Dummy
|
9
11
|
class Application < Rails::Application
|
data/test/dummy/config/boot.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Set up gems listed in the Gemfile.
|
2
|
-
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('
|
4
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../../Gemfile', __dir__)
|
3
5
|
|
4
6
|
require 'bundler/setup' if File.exist?(ENV['BUNDLE_GEMFILE'])
|
5
|
-
$LOAD_PATH.unshift File.expand_path('
|
7
|
+
$LOAD_PATH.unshift File.expand_path('../../../lib', __dir__)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
Rails.application.configure do
|
2
4
|
# Settings specified here will take precedence over those in config/application.rb.
|
3
5
|
|
@@ -35,7 +37,8 @@ Rails.application.configure do
|
|
35
37
|
# yet still be able to expire them through the digest params.
|
36
38
|
config.assets.digest = true
|
37
39
|
|
38
|
-
# `config.assets.precompile` and `config.assets.version` have moved to
|
40
|
+
# `config.assets.precompile` and `config.assets.version` have moved to
|
41
|
+
# config/initializers/assets.rb
|
39
42
|
|
40
43
|
# Specifies the header that your server uses for sending files.
|
41
44
|
# config.action_dispatch.x_sendfile_header = 'X-Sendfile' # for Apache
|
@@ -1,7 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Be sure to restart your server when you modify this file.
|
2
4
|
|
3
|
-
# You can add backtrace silencers for libraries that you're using but don't wish to see
|
5
|
+
# You can add backtrace silencers for libraries that you're using but don't wish to see
|
6
|
+
# in your backtraces.
|
4
7
|
# Rails.backtrace_cleaner.add_silencer { |line| line =~ /my_noisy_library/ }
|
5
8
|
|
6
|
-
# You can also remove all the silencers if you're trying to debug a problem that might
|
9
|
+
# You can also remove all the silencers if you're trying to debug a problem that might
|
10
|
+
# stem from framework code.
|
7
11
|
# Rails.backtrace_cleaner.remove_silencers!
|
data/test/dummy/config/routes.rb
CHANGED
data/test/test_helper.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
# Configure Rails Environment
|
3
4
|
ENV['RAILS_ENV'] = 'test'
|
4
5
|
|
5
|
-
require File.expand_path('
|
6
|
+
require File.expand_path('../test/dummy/config/environment.rb', __dir__)
|
6
7
|
ActiveRecord::Migrator.migrations_paths = [
|
7
|
-
File.expand_path('
|
8
|
+
File.expand_path('../test/dummy/db/migrate', __dir__)
|
8
9
|
]
|
9
10
|
require 'rails/test_help'
|
10
11
|
|
@@ -17,6 +18,6 @@ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each { |f| require f }
|
|
17
18
|
|
18
19
|
# Load fixtures from the engine
|
19
20
|
if ActiveSupport::TestCase.respond_to?(:fixture_path=)
|
20
|
-
ActiveSupport::TestCase.fixture_path = File.expand_path('
|
21
|
+
ActiveSupport::TestCase.fixture_path = File.expand_path('fixtures', __dir__)
|
21
22
|
ActiveSupport::TestCase.fixtures :all
|
22
23
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_scaffold
|
@@ -25,19 +25,19 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 3.4.41.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: eac_ruby_utils
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '0.3'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '0.3'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: httpclient
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rails
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 4.2.10
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 4.2.10
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: sqlite3
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,7 +100,17 @@ files:
|
|
86
100
|
- config/routes.rb
|
87
101
|
- db/migrate/20171201021251_create_aranha_addresses.rb
|
88
102
|
- lib/aranha.rb
|
103
|
+
- lib/aranha/default_processor.rb
|
104
|
+
- lib/aranha/dom_elements_traverser.rb
|
105
|
+
- lib/aranha/dom_elements_traverser/conditions.rb
|
106
|
+
- lib/aranha/dom_elements_traverser/cursor.rb
|
107
|
+
- lib/aranha/dom_elements_traverser/data.rb
|
89
108
|
- lib/aranha/engine.rb
|
109
|
+
- lib/aranha/parsers/base.rb
|
110
|
+
- lib/aranha/parsers/html/base.rb
|
111
|
+
- lib/aranha/parsers/html/item_list.rb
|
112
|
+
- lib/aranha/parsers/html/node/base.rb
|
113
|
+
- lib/aranha/parsers/html/node/default.rb
|
90
114
|
- lib/aranha/processor.rb
|
91
115
|
- lib/aranha/version.rb
|
92
116
|
- lib/tasks/aranha_tasks.rake
|
@@ -156,40 +180,40 @@ summary: Rails utilities for web crawling.
|
|
156
180
|
test_files:
|
157
181
|
- test/dummy/Rakefile
|
158
182
|
- test/dummy/README.rdoc
|
159
|
-
- test/dummy/
|
160
|
-
- test/dummy/bin/rake
|
161
|
-
- test/dummy/bin/bundle
|
162
|
-
- test/dummy/bin/setup
|
163
|
-
- test/dummy/config/initializers/assets.rb
|
164
|
-
- test/dummy/config/initializers/to_time_preserves_timezone.rb
|
165
|
-
- test/dummy/config/initializers/wrap_parameters.rb
|
166
|
-
- test/dummy/config/initializers/session_store.rb
|
167
|
-
- test/dummy/config/initializers/cookies_serializer.rb
|
168
|
-
- test/dummy/config/initializers/inflections.rb
|
169
|
-
- test/dummy/config/initializers/mime_types.rb
|
170
|
-
- test/dummy/config/initializers/backtrace_silencers.rb
|
171
|
-
- test/dummy/config/initializers/filter_parameter_logging.rb
|
183
|
+
- test/dummy/config.ru
|
172
184
|
- test/dummy/config/boot.rb
|
173
|
-
- test/dummy/config/locales/en.yml
|
174
|
-
- test/dummy/config/secrets.yml
|
175
|
-
- test/dummy/config/environment.rb
|
176
185
|
- test/dummy/config/database.yml
|
177
|
-
- test/dummy/config/
|
186
|
+
- test/dummy/config/secrets.yml
|
187
|
+
- test/dummy/config/locales/en.yml
|
188
|
+
- test/dummy/config/application.rb
|
178
189
|
- test/dummy/config/environments/development.rb
|
179
190
|
- test/dummy/config/environments/test.rb
|
180
191
|
- test/dummy/config/environments/production.rb
|
181
|
-
- test/dummy/config/
|
182
|
-
- test/dummy/
|
183
|
-
- test/dummy/
|
192
|
+
- test/dummy/config/environment.rb
|
193
|
+
- test/dummy/config/routes.rb
|
194
|
+
- test/dummy/config/initializers/assets.rb
|
195
|
+
- test/dummy/config/initializers/cookies_serializer.rb
|
196
|
+
- test/dummy/config/initializers/inflections.rb
|
197
|
+
- test/dummy/config/initializers/session_store.rb
|
198
|
+
- test/dummy/config/initializers/wrap_parameters.rb
|
199
|
+
- test/dummy/config/initializers/to_time_preserves_timezone.rb
|
200
|
+
- test/dummy/config/initializers/filter_parameter_logging.rb
|
201
|
+
- test/dummy/config/initializers/backtrace_silencers.rb
|
202
|
+
- test/dummy/config/initializers/mime_types.rb
|
203
|
+
- test/dummy/db/schema.rb
|
184
204
|
- test/dummy/app/views/layouts/application.html.erb
|
185
|
-
- test/dummy/app/helpers/application_helper.rb
|
186
205
|
- test/dummy/app/controllers/application_controller.rb
|
187
|
-
- test/dummy/
|
188
|
-
- test/dummy/
|
189
|
-
- test/dummy/
|
206
|
+
- test/dummy/app/helpers/application_helper.rb
|
207
|
+
- test/dummy/app/assets/stylesheets/application.css
|
208
|
+
- test/dummy/app/assets/javascripts/application.js
|
190
209
|
- test/dummy/public/422.html
|
191
210
|
- test/dummy/public/404.html
|
192
|
-
- test/dummy/
|
193
|
-
- test/
|
211
|
+
- test/dummy/public/favicon.ico
|
212
|
+
- test/dummy/public/500.html
|
213
|
+
- test/dummy/bin/bundle
|
214
|
+
- test/dummy/bin/setup
|
215
|
+
- test/dummy/bin/rails
|
216
|
+
- test/dummy/bin/rake
|
194
217
|
- test/aranha_test.rb
|
218
|
+
- test/test_helper.rb
|
195
219
|
- test/integration/navigation_test.rb
|