apify_core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +79 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +3 -0
- data/apify_core.gemspec +34 -0
- data/bin/bundler +16 -0
- data/bin/coderay +16 -0
- data/bin/htmldiff +16 -0
- data/bin/ldiff +16 -0
- data/bin/nokogiri +16 -0
- data/bin/pry +16 -0
- data/bin/rackup +16 -0
- data/bin/rake +16 -0
- data/bin/rspec +16 -0
- data/bin/server +16 -0
- data/bin/tilt +16 -0
- data/lib/apify_core/fetcher.rb +190 -0
- data/lib/apify_core/filter.rb +83 -0
- data/lib/apify_core/parser.rb +68 -0
- data/lib/apify_core/version.rb +5 -0
- data/lib/apify_core.rb +19 -0
- data/spec/complex_spec.rb +736 -0
- data/spec/examples/apify_request.json +62 -0
- data/spec/examples/apify_response.json +1399 -0
- data/spec/examples/github_blog_request.json +24 -0
- data/spec/examples/oblomoff_events_request.json +21 -0
- data/spec/examples/vgorode_dn_events_request.json +23 -0
- data/spec/examples/vgorode_dp_events_request.json +23 -0
- data/spec/examples/vgorode_kh_events_request.json +23 -0
- data/spec/examples/vgorode_kiev_events_request.json +23 -0
- data/spec/examples/vgorode_lg_events_request.json +23 -0
- data/spec/examples/vgorode_lviv_events_request.json +23 -0
- data/spec/examples/vgorode_od_events_request.json +23 -0
- data/spec/examples/vgorode_zp_events_request.json +23 -0
- data/spec/spec_helper.rb +8 -0
- metadata +247 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
module Apify
  module Core
    # Extracts structured data from an HTML document by walking a declarative
    # pattern (nested Hashes / Arrays / Strings) and replacing every
    # `<% selector | filter | ... %>` placeholder with the value produced by
    # Filter.apply for the nodes matching that selector.
    class Parser
      # Hash key whose value is a selector; the remaining keys of that Hash are
      # evaluated once per node matched by the selector.
      ITERATOR_KEY = '__iterator__'.freeze

      # One `<% ... %>` placeholder; capture group 1 is the text between the
      # delimiters (selector plus optional `|`-separated filter names).
      PLACEHOLDER = /<%\s?+(.*?)\s?+%>/

      private_constant :ITERATOR_KEY, :PLACEHOLDER

      # @param html [String] markup handed to Nokogiri::HTML
      # @param pattern [Hash] extraction pattern keyed by output field name
      def initialize(html = "", pattern = {})
        @html = html
        @pattern = pattern
        @doc = ::Nokogiri::HTML(html)
        @result = {}
      end

      # Evaluates every top-level pattern entry against the parsed document.
      # A top-level `__iterator__` key is skipped and does not appear in the output.
      #
      # @return [Hash] field name => extracted value
      def perform
        @pattern.each do |key, value|
          next if key == ITERATOR_KEY

          @result[key] = process(value, @doc)
        end
        @result
      end

      # Parses each document and collects the nodes matching +expression+.
      #
      # @param expression [String] selector passed to Nokogiri's `search`
      # @param docs [Enumerable<String>] HTML sources
      # @return [Array] flattened search results of all documents
      def self.fetch(expression, docs)
        docs.map { |html| ::Nokogiri::HTML(html).search(expression) }.flatten
      end

      private

      # Dispatches on the pattern value's type.
      #
      # @param value [Object] a pattern fragment
      # @param context [#search] node (or document) the selectors are run against
      # @return [Object] the extracted structure mirroring +value+
      def process(value, context)
        case value
        when Hash   then process_hash(value, context)
        when Array  then value.map { |item| process(item, context) }
        when String then process_string(value, context)
        else
          # Literals such as numbers, booleans or nil carry no placeholders;
          # hand them back unchanged instead of silently turning them into nil.
          value
        end
      end

      # Evaluates a Hash pattern. Without an iterator key the result is a Hash
      # with the same keys; with one, the result is an Array holding one Hash
      # per node matched by the iterator selector.
      def process_hash(pattern, context)
        unless pattern.key?(ITERATOR_KEY)
          return pattern.each_with_object({}) do |(key, value), out|
            out[key] = process(value, context)
          end
        end

        selector = pattern[ITERATOR_KEY].strip
        context.search(selector).map do |node|
          pattern.each_with_object({}) do |(key, value), item|
            next if key == ITERATOR_KEY

            item[key] = process(value, node)
          end
        end
      end

      # Evaluates a String pattern.
      #
      # * no placeholder          -> the string itself
      # * exactly one placeholder and nothing but whitespace around it
      #                           -> the raw filter result (may be a non-String)
      # * anything else           -> the string with every placeholder replaced
      #                              by its filter result (converted via to_s)
      def process_string(value, context)
        first = PLACEHOLDER.match(value)
        return value unless first

        # Use the regex capture rather than slicing fixed offsets off the
        # string, so surrounding text or padding cannot corrupt the selector.
        if first.pre_match.strip.empty? && first.post_match.strip.empty?
          evaluate(first[1], context)
        else
          value.gsub(PLACEHOLDER) { evaluate(Regexp.last_match(1), context) }
        end
      end

      # Runs one placeholder body (`selector | filter | ...`) against +context+.
      #
      # @param template [String] text found between `<%` and `%>`
      # @return [Object, nil] whatever Filter.apply returns; nil when the
      #   placeholder contains no selector
      def evaluate(template, context)
        expression, *filters = template.split('|').map(&:strip)
        return nil if expression.nil? || expression.empty?

        # A fresh array per call: Filter.apply is defined elsewhere and may
        # mutate the list it receives.
        filters = ['first', 'text', 'strip'] if filters.empty?

        nodes = context.search(expression)
        Filter.apply(nodes, filters) if nodes
      end
    end
  end
end
|
data/lib/apify_core.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require "apify_core/version"
|
2
|
+
require "apify_core/parser"
|
3
|
+
require "apify_core/filter"
|
4
|
+
require "apify_core/fetcher"
|
5
|
+
|
6
|
+
module Apify
  module Core
    # Builds a Fetcher for +pages+, runs its `prepare` and `perform` steps in
    # that order, and returns the fetcher.
    #
    # @param pages [#with_indifferent_access] page definitions; converted with
    #   `with_indifferent_access` before being handed to Fetcher.new
    # @param processes [Integer] forwarded to Fetcher.new as its second argument
    # @param delay [Numeric] forwarded to Fetcher.new as its third argument
    # @return [Fetcher] the fetcher after `prepare` and `perform` have been called
    def self.crawl!(pages, processes = 2, delay = 0)
      fetcher = Fetcher.new(pages.with_indifferent_access, processes, delay)
      fetcher.prepare
      fetcher.perform
      fetcher
    end

    # @return [String] absolute path obtained by resolving `../..` against this
    #   file's path (the directory containing `lib/` for `lib/apify_core.rb`)
    def self.root
      File.expand_path('../..', __FILE__)
    end
  end
end
|