apify_core 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +79 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +3 -0
- data/apify_core.gemspec +34 -0
- data/bin/bundler +16 -0
- data/bin/coderay +16 -0
- data/bin/htmldiff +16 -0
- data/bin/ldiff +16 -0
- data/bin/nokogiri +16 -0
- data/bin/pry +16 -0
- data/bin/rackup +16 -0
- data/bin/rake +16 -0
- data/bin/rspec +16 -0
- data/bin/server +16 -0
- data/bin/tilt +16 -0
- data/lib/apify_core/fetcher.rb +190 -0
- data/lib/apify_core/filter.rb +83 -0
- data/lib/apify_core/parser.rb +68 -0
- data/lib/apify_core/version.rb +5 -0
- data/lib/apify_core.rb +19 -0
- data/spec/complex_spec.rb +736 -0
- data/spec/examples/apify_request.json +62 -0
- data/spec/examples/apify_response.json +1399 -0
- data/spec/examples/github_blog_request.json +24 -0
- data/spec/examples/oblomoff_events_request.json +21 -0
- data/spec/examples/vgorode_dn_events_request.json +23 -0
- data/spec/examples/vgorode_dp_events_request.json +23 -0
- data/spec/examples/vgorode_kh_events_request.json +23 -0
- data/spec/examples/vgorode_kiev_events_request.json +23 -0
- data/spec/examples/vgorode_lg_events_request.json +23 -0
- data/spec/examples/vgorode_lviv_events_request.json +23 -0
- data/spec/examples/vgorode_od_events_request.json +23 -0
- data/spec/examples/vgorode_zp_events_request.json +23 -0
- data/spec/spec_helper.rb +8 -0
- metadata +247 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
module Apify
  module Core
    # Walks a nested "pattern" (hashes, arrays and strings) and builds a
    # structure of the same shape in which every <% selector | filter ... %>
    # tag found in a string has been evaluated against an HTML document.
    #
    # Selectors are handed to the context's #search method (a Nokogiri
    # document or node); the search result and the filter names are then
    # passed to Filter.apply, which is defined outside this file.
    class Parser
      # Hash key whose value is a selector; the remaining keys of that hash
      # are evaluated once per node matched by the selector.
      ITERATOR_KEY = '__iterator__'.freeze

      # Matches one <% ... %> tag and captures its body.
      TAG = /<%\s?+(.*?)\s?+%>/

      # Filters used when a tag names none of its own.
      DEFAULT_FILTERS = %w[first text strip].freeze

      # @param html [String] markup handed to Nokogiri::HTML
      # @param pattern [Hash] extraction pattern; string keys are expected,
      #   since the iterator key is compared against a String
      def initialize(html = "", pattern = {})
        @html = html
        @pattern = pattern
        @doc = ::Nokogiri::HTML(html)
        @result = {}
      end

      # Evaluates every top-level entry of the pattern against the document.
      # A top-level '__iterator__' entry is skipped.
      #
      # @return [Hash] the accumulated results, keyed like the pattern
      def perform
        @pattern.each do |key, value|
          next if key == ITERATOR_KEY

          @result[key] = process(value, @doc)
        end
        @result
      end

      # Runs +expression+ against each document in +docs+.
      #
      # @param expression [String] selector passed to #search
      # @param docs [Enumerable<String>] HTML sources
      # @return [Array] the matches of all documents in one flat array
      def self.fetch(expression, docs)
        docs.flat_map { |html| ::Nokogiri::HTML(html).search(expression).to_a }
      end

      private

      # Dispatches on the type of a pattern value. Values that are not a
      # Hash, Array or String produce nil.
      def process(value, context)
        case value
        when Hash   then process_hash(value, context)
        when Array  then value.map { |item| process(item, context) }
        when String then process_string(value, context)
        end
      end

      # Without an iterator key the hash is evaluated once against +context+.
      # With one, it is evaluated once per node matched by the iterator
      # selector, producing an Array of hashes.
      def process_hash(hash, context)
        return extract_fields(hash, context) unless hash.key?(ITERATOR_KEY)

        context.search(hash[ITERATOR_KEY].strip).map do |node|
          extract_fields(hash, node)
        end
      end

      # Evaluates every entry of +hash+ except the iterator key.
      def extract_fields(hash, context)
        hash.each_with_object({}) do |(key, template), fields|
          next if key == ITERATOR_KEY

          fields[key] = process(template, context)
        end
      end

      # A string without tags is returned untouched. A string that consists
      # of a single tag (surrounding whitespace ignored) yields the filter
      # result as-is, so it may be a non-String. In every other case each tag
      # is replaced, inside the string, by its result converted to a String.
      def process_string(template, context)
        tags = template.scan(TAG)
        return template if tags.empty?

        tag = template.match(TAG)
        # The tag body comes from the regex capture rather than from fixed
        # offsets, so text before or after a lone tag is preserved instead
        # of being sliced into the selector.
        if tags.size == 1 && tag.pre_match.strip.empty? && tag.post_match.strip.empty?
          evaluate(tag[1], context)
        else
          template.gsub(TAG) { evaluate(Regexp.last_match(1), context) }
        end
      end

      # Splits "selector | filter | filter" on '|', searches +context+ with
      # the selector and applies the filters to whatever was found.
      def evaluate(tag_body, context)
        expression, *filters = tag_body.split('|').map(&:strip)
        # dup: keep the frozen default list safe in case Filter.apply
        # modifies its argument.
        filters = DEFAULT_FILTERS.dup if filters.empty?

        # to_s covers an empty tag body, where the split yields no selector.
        result = context.search(expression.to_s)
        result ? Filter.apply(result, filters) : nil
      end
    end
  end
end
|
data/lib/apify_core.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require "apify_core/version"
|
2
|
+
require "apify_core/parser"
|
3
|
+
require "apify_core/filter"
|
4
|
+
require "apify_core/fetcher"
|
5
|
+
|
6
|
+
module Apify
  module Core
    # Builds a Fetcher for +pages+, runs its prepare and perform steps in
    # that order, and hands the fetcher back to the caller.
    #
    # @param pages [#with_indifferent_access] page definitions; presumably a
    #   Hash extended by ActiveSupport, which is not required in this file
    # @param processes [Integer] forwarded unchanged to Fetcher.new
    # @param delay [Numeric] forwarded unchanged to Fetcher.new
    # @return [Fetcher] the fetcher after both steps have run
    def self.crawl!(pages, processes = 2, delay = 0)
      fetcher = Fetcher.new(pages.with_indifferent_access, processes, delay)
      fetcher.prepare
      fetcher.perform
      fetcher
    end

    # @return [String] absolute path two levels above this file
    def self.root
      File.expand_path('../..', __FILE__)
    end
  end
end
|