apify_core 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,68 @@
1
module Apify
  module Core
    # Turns an HTML document into a nested Ruby structure by walking a
    # "pattern": a Hash whose values are Hashes, Arrays or Strings.
    #
    # Strings may embed selector tags of the form
    #   <% selector | filter | filter ... %>
    # The selector is handed to the current context's #search and the
    # outcome is passed to Filter.apply together with the filter names.
    # A Hash containing the '__iterator__' key is expanded into an Array:
    # one Hash per node found by the iterator selector, with the remaining
    # keys evaluated relative to that node.
    class Parser
      # Matches a single <% ... %> tag and captures its content.
      TAG_PATTERN = /<%\s?+(.*?)\s?+%>/
      # Pattern key holding the selector that drives iteration.
      ITERATOR_KEY = '__iterator__'
      # Filters used when a tag names none.
      DEFAULT_FILTERS = ['first', 'text', 'strip'].freeze
      private_constant :TAG_PATTERN, :ITERATOR_KEY, :DEFAULT_FILTERS

      # @param html [String] markup handed to Nokogiri::HTML
      # @param pattern [Hash] extraction pattern (see class comment)
      def initialize(html = "", pattern = {})
        @html = html
        @pattern = pattern
        @doc = ::Nokogiri::HTML(html)
        @result = {}
      end

      # Evaluates every top-level pattern entry against the parsed document.
      # A top-level '__iterator__' key is ignored.
      #
      # @return [Hash] pattern keys mapped to their extracted values
      def perform
        @pattern.each do |key, value|
          next if key == ITERATOR_KEY

          @result[key] = process(value, @doc)
        end
        @result
      end

      # Parses each document and collects every node matching +expression+
      # into one flat Array.
      #
      # @param expression [String] selector passed to #search
      # @param docs [Enumerable<String>] HTML sources
      # @return [Array] matching nodes from all documents, in order
      def self.fetch(expression, docs)
        docs.flat_map { |html| ::Nokogiri::HTML(html).search(expression) }
      end

      private

      # Dispatches on the pattern value's type. Values that are neither
      # Hash, Array nor String evaluate to nil.
      def process(value, context)
        case value
        when Hash   then process_hash(value, context)
        when Array  then value.map { |item| process(item, context) }
        when String then process_string(value, context)
        end
      end

      # Evaluates a Hash pattern: an iterator Hash becomes an Array of
      # Hashes, any other Hash is evaluated key by key in the same context.
      def process_hash(value, context)
        return process_iterator(value, context) if value.key?(ITERATOR_KEY)

        value.each_with_object({}) do |(key, sub_pattern), out|
          out[key] = process(sub_pattern, context)
        end
      end

      # Builds one Hash per node matched by the iterator selector; the
      # other keys are evaluated with that node as their context.
      def process_iterator(value, context)
        context.search(value[ITERATOR_KEY].strip).map do |node|
          value.each_with_object({}) do |(key, sub_pattern), row|
            next if key == ITERATOR_KEY

            row[key] = process(sub_pattern, node)
          end
        end
      end

      # Evaluates a String pattern.
      # - no tag: the string is returned untouched;
      # - the string is exactly one tag (ignoring surrounding whitespace):
      #   the filtered result is returned as-is, whatever its type;
      # - otherwise every tag is replaced in place by its result (converted
      #   to a String by gsub), so literal text around a tag is preserved
      #   even when there is only one tag.
      def process_string(value, context)
        tag_count = value.scan(TAG_PATTERN).size
        return value if tag_count.zero?

        if tag_count == 1 && value[TAG_PATTERN] == value.strip
          evaluate_tag(value[TAG_PATTERN, 1], context)
        else
          value.gsub(TAG_PATTERN) { evaluate_tag(Regexp.last_match(1), context) }
        end
      end

      # Runs one tag's content ("selector | filter | ...") against context.
      # A tag without a selector yields nil instead of being searched for.
      def evaluate_tag(content, context)
        expression, *filters = content.split('|').map(&:strip)
        return nil if expression.nil? || expression.empty?

        # dup: Filter.apply receives its own array, as it did when the
        # defaults were built inline on every call.
        filters = DEFAULT_FILTERS.dup if filters.empty?

        result = context.search(expression)
        result ? Filter.apply(result, filters) : nil
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Apify
  module Core
    # Gem version string. Frozen so the constant cannot be mutated in place.
    VERSION = "0.1.0".freeze
  end
end
data/lib/apify_core.rb ADDED
@@ -0,0 +1,19 @@
1
+ require "apify_core/version"
2
+ require "apify_core/parser"
3
+ require "apify_core/filter"
4
+ require "apify_core/fetcher"
5
+
6
module Apify
  module Core
    # Builds a Fetcher for +pages+, runs its prepare and perform steps in
    # that order, and returns the fetcher.
    #
    # @param pages [#with_indifferent_access] page definitions; converted
    #   with #with_indifferent_access before being given to the Fetcher
    # @param processes [Integer] forwarded unchanged to Fetcher.new
    #   (presumably the number of workers - confirm against Fetcher)
    # @param delay [Numeric] forwarded unchanged to Fetcher.new
    #   (presumably a pause between requests - confirm against Fetcher)
    # @return [Fetcher] the fetcher after #prepare and #perform have run
    def self.crawl!(pages, processes = 2, delay = 0)
      fetcher = Fetcher.new(pages.with_indifferent_access, processes, delay)
      fetcher.prepare
      fetcher.perform
      fetcher
    end

    # @return [String] absolute path two levels above this file
    def self.root
      File.expand_path('../..', __FILE__)
    end
  end
end