scrubyt 0.2.6 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -1,27 +1,27 @@
|
|
1
1
|
module Scrubyt
|
2
2
|
##
|
3
3
|
#=<tt>Rejecting result instances based on further rules</tt>
|
4
|
-
#
|
4
|
+
#
|
5
5
|
#The two most trivial problems with a set of rules are that they match either fewer
|
6
6
|
#or more instances than we would like them to. Constraints are a way to remedy the second problem:
|
7
7
|
#they serve as a tool to filter out some result instances based on rules. A typical
|
8
8
|
#example:
|
9
|
-
#
|
9
|
+
#
|
10
10
|
#* *ensure_presence_of_ancestor_pattern* consider this model:
|
11
11
|
# <book>
|
12
12
|
# <author>...</author>
|
13
13
|
# <title>...</title>
|
14
14
|
# </book>
|
15
|
-
#
|
15
|
+
#
|
16
16
|
#If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
|
17
|
-
#'author' and 'title', only those books will be matched which have an author and a
|
17
|
+
#'author' and 'title', only those books will be matched which have an author and a
|
18
18
|
#title (i.e. the child patterns author and title must extract something). This is a way
|
19
|
-
#to say 'a book MUST have an author and a title'.
|
19
|
+
#to say 'a book MUST have an author and a title'.
|
20
20
|
class Constraint
|
21
21
|
#There are more possible ways of applying/checking constraints in the case of
|
22
|
-
#ones that can not be checked in the context node (e.g. ensure_presence_of -
|
22
|
+
#ones that can not be checked in the context node (e.g. ensure_presence_of -
|
23
23
|
#since it may require the evaluation of child patterns of the context pattern to
|
24
|
-
#arbitray level)
|
24
|
+
#arbitrary level)
|
25
25
|
#
|
26
26
|
#In such cases, the possibilities are:
|
27
27
|
#
|
@@ -29,56 +29,54 @@ module Scrubyt
|
|
29
29
|
# pattern is evaluated. This can mess things up, since if any ancestor node uses
|
30
30
|
# the sinks of predecessor(s) other than the context node, those need to be evaluated
|
31
31
|
# too, and we may run into a cyclic dependency or at least a complicated recursion
|
32
|
-
#
|
33
|
-
#2) Post processing - evaluate normally and throw out results which do not pass the
|
32
|
+
#
|
33
|
+
#2) Post processing - evaluate normally and throw out results which do not pass the
|
34
34
|
# constraint
|
35
35
|
#
|
36
36
|
#2b) Do it on the XML level - most probably this solution will be implemented
|
37
|
-
|
37
|
+
|
38
38
|
# Different constraint types
|
39
39
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
|
40
40
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
|
41
41
|
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
|
42
42
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
|
43
43
|
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
|
44
|
-
|
45
|
-
|
46
|
-
attr_reader :type, :target
|
47
|
-
|
44
|
+
|
45
|
+
|
46
|
+
attr_reader :type, :target
|
47
|
+
|
48
48
|
#Add 'ensure presence of ancestor pattern' constraint
|
49
|
-
|
49
|
+
|
50
50
|
#If this type of constraint is added to a pattern, it must have an ancestor pattern
|
51
51
|
#(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
|
52
52
|
#'Has an ancestor pattern' means that the ancestor pattern actually extracts something
|
53
53
|
#(just by looking at the wrapper model, the ancestor pattern is always present)
|
54
54
|
#Note that from this type of constraint there is no 'ensure_absence' version, since
|
55
|
-
#I could not think about an use case for that
|
56
|
-
def self.add_ensure_presence_of_pattern(
|
57
|
-
Constraint.new(
|
55
|
+
#I could not think about an use case for that
|
56
|
+
def self.add_ensure_presence_of_pattern(ancestor)
|
57
|
+
Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
#Add 'ensure absence of attribute' constraint
|
61
|
-
|
61
|
+
|
62
62
|
#If this type of constraint is added to a pattern, the HTML node it targets
|
63
63
|
#must NOT have an attribute named "attribute_name" with the value "attribute_value"
|
64
|
-
def self.add_ensure_absence_of_attribute(
|
65
|
-
Constraint.new(
|
66
|
-
attribute_hash,
|
64
|
+
def self.add_ensure_absence_of_attribute(attribute_hash)
|
65
|
+
Constraint.new(attribute_hash,
|
67
66
|
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
|
68
67
|
end
|
69
|
-
|
68
|
+
|
70
69
|
#Add 'ensure presence of attribute' constraint
|
71
|
-
|
70
|
+
|
72
71
|
#If this type of constraint is added to a pattern, the HTML node it targets
|
73
72
|
#must have an attribute named "attribute_name" with the value "attribute_value"
|
74
|
-
def self.add_ensure_presence_of_attribute(
|
75
|
-
Constraint.new(
|
76
|
-
attribute_hash,
|
73
|
+
def self.add_ensure_presence_of_attribute(attribute_hash)
|
74
|
+
Constraint.new(attribute_hash,
|
77
75
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
|
78
76
|
end
|
79
|
-
|
80
|
-
#Add 'ensure absence of ancestor node' constraint
|
81
|
-
|
77
|
+
|
78
|
+
#Add 'ensure absence of ancestor node' constraint
|
79
|
+
|
82
80
|
#If this type of constraint is added to a pattern, the HTML node extracted by the pattern
|
83
81
|
#must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
|
84
82
|
#
|
@@ -88,14 +86,13 @@ module Scrubyt
|
|
88
86
|
#class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
|
89
87
|
#
|
90
88
|
#"attributes" can be empty - in this case just the 'node_name' is checked
|
91
|
-
def self.add_ensure_absence_of_ancestor_node(
|
92
|
-
Constraint.new(
|
93
|
-
|
94
|
-
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
|
89
|
+
def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
|
90
|
+
Constraint.new([node_name, attributes],
|
91
|
+
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
|
95
92
|
end
|
96
|
-
|
97
|
-
#Add 'ensure presence of ancestor node' constraint
|
98
|
-
|
93
|
+
|
94
|
+
#Add 'ensure presence of ancestor node' constraint
|
95
|
+
|
99
96
|
#If this type of constraint is added to a pattern, the HTML node extracted by the pattern
|
100
97
|
#must contain an HTML ancestor node called 'node_name' with the attribute set 'attributes'.
|
101
98
|
#
|
@@ -105,12 +102,11 @@ module Scrubyt
|
|
105
102
|
#class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
|
106
103
|
#
|
107
104
|
#"attributes" can be empty - in this case just the 'node_name' is checked
|
108
|
-
def self.add_ensure_presence_of_ancestor_node(
|
109
|
-
Constraint.new(
|
110
|
-
|
111
|
-
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
|
105
|
+
def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
|
106
|
+
Constraint.new([node_name, attributes],
|
107
|
+
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
|
112
108
|
end
|
113
|
-
|
109
|
+
|
114
110
|
#Evaluate the constraint; if this function returns true,
|
115
111
|
#it means that the constraint passed, i.e. its filter will be added to the extracted
|
116
112
|
#content of the pattern
|
@@ -123,21 +119,20 @@ module Scrubyt
|
|
123
119
|
attribute_present(result)
|
124
120
|
when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
|
125
121
|
!attribute_present(result)
|
126
|
-
when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
|
127
|
-
ancestor_node_present(result)
|
128
|
-
when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
|
122
|
+
when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
|
123
|
+
ancestor_node_present(result)
|
124
|
+
when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
|
129
125
|
!ancestor_node_present(result)
|
130
126
|
end
|
131
127
|
end
|
132
|
-
|
128
|
+
|
133
129
|
private
|
134
130
|
#We would not like these to be called from outside
|
135
|
-
def initialize(
|
136
|
-
@type = type
|
137
|
-
@parent_filter = parent_filter
|
131
|
+
def initialize(target, type)
|
138
132
|
@target = target
|
139
|
-
|
140
|
-
|
133
|
+
@type = type
|
134
|
+
end
|
135
|
+
|
141
136
|
#Implementation of the ancestor node presence test
|
142
137
|
#Check the documentation of the add_ensure_presence_of_ancestor_node method
|
143
138
|
#for further information on the result parameter
|
@@ -153,21 +148,22 @@ module Scrubyt
|
|
153
148
|
end
|
154
149
|
false
|
155
150
|
end
|
156
|
-
|
151
|
+
|
157
152
|
def attribute_present(result)
|
153
|
+
return unless result.is_a? Hpricot::Elem
|
158
154
|
match = true
|
159
155
|
#If v = nil, the value of the attribute can be arbitrary;
|
160
156
|
#Therefore, in this case we just have to make sure that the attribute is
|
161
157
|
#present (i.e. != nil), we don't care about the value
|
162
158
|
@target.each do |k,v|
|
163
159
|
if v == nil
|
164
|
-
match &&= (result.attributes[k.to_s] != nil)
|
160
|
+
match &&= (result.attributes[k.to_s] != nil)
|
165
161
|
else
|
166
|
-
match &&= (result.attributes[k.to_s] == v.to_s)
|
167
|
-
end
|
162
|
+
match &&= (result.attributes[k.to_s] == v.to_s)
|
163
|
+
end
|
168
164
|
end
|
169
165
|
match
|
170
166
|
end
|
171
|
-
|
167
|
+
|
172
168
|
end #end of class
|
173
169
|
end #end of module
|
@@ -10,58 +10,35 @@ module Scrubyt
|
|
10
10
|
#I will not document the functions since these are just forwarders; See the 'real'
|
11
11
|
#functions with their documentation in Scrubyt::Constraint.rb
|
12
12
|
class ConstraintAdder
|
13
|
-
|
14
|
-
def self.ensure_presence_of_pattern(
|
15
|
-
|
16
|
-
pattern #To make chaining possible
|
13
|
+
|
14
|
+
def self.ensure_presence_of_pattern(ancestor_node_name)
|
15
|
+
Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
|
17
16
|
end
|
18
17
|
|
19
|
-
def self.ensure_presence_of_ancestor_node(
|
20
|
-
|
21
|
-
|
22
|
-
pattern #To make chaining possible
|
18
|
+
def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
|
19
|
+
Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
|
20
|
+
prepare_attributes(attributes))
|
23
21
|
end
|
24
22
|
|
25
|
-
def self.ensure_absence_of_ancestor_node(
|
26
|
-
|
23
|
+
def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
|
24
|
+
Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
|
27
25
|
prepare_attributes(attributes))
|
28
|
-
pattern #To make chaining possible
|
29
26
|
end
|
30
27
|
|
31
|
-
def self.ensure_presence_of_attribute(
|
32
|
-
|
33
|
-
pattern #To make chaining possible
|
28
|
+
def self.ensure_presence_of_attribute(attribute_hash)
|
29
|
+
Constraint.add_ensure_presence_of_attribute(attribute_hash)
|
34
30
|
end
|
35
31
|
|
36
|
-
def self.ensure_absence_of_attribute(
|
37
|
-
|
38
|
-
pattern #To make chaining possible
|
32
|
+
def self.ensure_absence_of_attribute(attribute_hash)
|
33
|
+
Constraint.add_ensure_absence_of_attribute(attribute_hash)
|
39
34
|
end
|
40
|
-
|
41
|
-
private
|
42
|
-
def self.find_by_name(root_pattern, name)
|
43
|
-
@found_pattern = nil
|
44
|
-
find_by_name_recursive(root_pattern, name)
|
45
|
-
if (@found_pattern == nil)
|
46
|
-
#$Logger.error("Fatal: No pattern named #{name} exists!")
|
47
|
-
puts "Fatal: No pattern named #{name} exists!"
|
48
|
-
end
|
49
|
-
@found_pattern
|
50
|
-
end
|
51
|
-
|
52
|
-
def self.find_by_name_recursive(pattern, name)
|
53
|
-
if pattern.name == name
|
54
|
-
@found_pattern = pattern
|
55
|
-
else
|
56
|
-
pattern.children.each {|child| find_by_name_recursive(child, name)}
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
35
|
+
|
36
|
+
private
|
60
37
|
def self.prepare_attributes(attributes)
|
61
38
|
attribute_pairs = []
|
62
39
|
attributes.each do |key, value|
|
63
40
|
if (value.instance_of? Array)
|
64
|
-
value.each {|val| attribute_pairs << [key,val]}
|
41
|
+
value.each {|val| attribute_pairs << [key,val]}
|
65
42
|
else
|
66
43
|
attribute_pairs << [key, value]
|
67
44
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
class AttributeFilter < BaseFilter
|
3
|
+
|
4
|
+
def evaluate(source)
|
5
|
+
elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
|
6
|
+
if elem.is_a? Hpricot::Elem
|
7
|
+
return [elem.attributes[@example]]
|
8
|
+
else
|
9
|
+
return nil
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
def to_sexp
|
14
|
+
[:str, @example]
|
15
|
+
end #end of method to_sexp
|
16
|
+
end #End of class AttributeFilter
|
17
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,111 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Filter out relevant pieces from the parent pattern</tt>
|
4
|
+
#
|
5
|
+
#A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
|
6
|
+
#it reaches the bottom. The biggest difference is that instead of water, a HTML
|
7
|
+
#document travels through the space.
|
8
|
+
#
|
9
|
+
#Of course Scrubyt would not make much sense if the same document would arrive at
|
10
|
+
#the bottom that was poured in at the top - since in this case we might use an
|
11
|
+
#identity transformation (i.e. do nothing with the input) as well.
|
12
|
+
#
|
13
|
+
#This is where filters come in: as their name says, they filter the stuff that is
|
14
|
+
#pouring from above, to leave the interesting parts and discard the rest.
|
15
|
+
#The working of a filter will be explained most easily by the help of an example.
|
16
|
+
#Let's consider that we would like to extract information from a webshop; Concretely
|
17
|
+
#we are interested in the name of the items and the URL pointing to the image of the
|
18
|
+
#item.
|
19
|
+
#
|
20
|
+
#To accomplish this, first we select the items with the pattern item (a pattern is
|
21
|
+
#a logical grouping of filters; see Pattern documentation) Then our new
|
22
|
+
#context is the result extracted by the 'item' pattern; For every 'item' pattern, further
|
23
|
+
#extract the name and the image of the item; and finally, extract the href attribute
|
24
|
+
#of the image. Let's see an illustration:
|
25
|
+
#
|
26
|
+
# root --> This pattern is called a 'root pattern', It is invisible to you
|
27
|
+
# | and basically it represents the document; it has no filters
|
28
|
+
# +-- item --> Filter what's coming from above (the whole document) to get
|
29
|
+
# | relevant pieces of data (in this case webshop items)
|
30
|
+
# +-- name --> Again, filter what's coming from above (a webshop item) and
|
31
|
+
# | leave only item names after this operation
|
32
|
+
# +-- image --> This time filter the image of the item
|
33
|
+
# |
|
34
|
+
# +-- href --> And finally, from the image elements, get the attribute 'href'
|
35
|
+
class BaseFilter
|
36
|
+
#Type of the example this filter is extracted with
|
37
|
+
|
38
|
+
#XPath example, like html/body/tr/td[1] etc.
|
39
|
+
EXAMPLE_TYPE_XPATH = 0
|
40
|
+
#String from the document, for example 'Canon EOS 300 D'.
|
41
|
+
EXAMPLE_TYPE_STRING = 1
|
42
|
+
#Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
|
43
|
+
EXAMPLE_TYPE_IMAGE = 2
|
44
|
+
#No example - the actual XPath is determined from the children XPaths (their LCA)
|
45
|
+
EXAMPLE_TYPE_CHILDREN = 3
|
46
|
+
#Regexp example, like /\d+@*\d+[a-z]/
|
47
|
+
EXAMPLE_TYPE_REGEXP = 4
|
48
|
+
#Compound example, like :contains => 'goodies'
|
49
|
+
EXAMPLE_TYPE_COMPOUND = 5
|
50
|
+
|
51
|
+
attr_accessor(:example_type, :parent_pattern, :temp_sink,
|
52
|
+
:constraints, :xpath, :regexp, :example, :source, :sink)
|
53
|
+
|
54
|
+
def self.create(parent_pattern, example=nil)
|
55
|
+
|
56
|
+
filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
|
57
|
+
if filter_name == 'RootFilter'
|
58
|
+
BaseFilter.new(parent_pattern, example)
|
59
|
+
else
|
60
|
+
instance_eval("#{filter_name}.new(parent_pattern, example)")
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
#Dispatcher method to add constraints; of course, as with any method_missing, this method
|
65
|
+
#should not be called directly
|
66
|
+
#TODO still used?
|
67
|
+
def method_missing(method_name, *args, &block)
|
68
|
+
case method_name.to_s
|
69
|
+
when /^ensure.+/
|
70
|
+
constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
|
71
|
+
else
|
72
|
+
raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def to_sexp
|
77
|
+
nil
|
78
|
+
end
|
79
|
+
|
80
|
+
private
|
81
|
+
#We don't want this to be accessible from outside
|
82
|
+
def initialize(parent_pattern, example)
|
83
|
+
@example_type = BaseFilter.determine_example_type(example)
|
84
|
+
@parent_pattern = parent_pattern
|
85
|
+
@sink = [] #output of a filter
|
86
|
+
@source = [] #input of a filter
|
87
|
+
@example = example
|
88
|
+
@xpath = nil #The xpath to evaluate this filter
|
89
|
+
@constraints = [] #list of constraints
|
90
|
+
end
|
91
|
+
|
92
|
+
def self.determine_example_type(example)
|
93
|
+
if example.instance_of? Regexp
|
94
|
+
EXAMPLE_TYPE_REGEXP
|
95
|
+
elsif example.instance_of? Hash
|
96
|
+
EXAMPLE_TYPE_COMPOUND
|
97
|
+
else
|
98
|
+
case example
|
99
|
+
when nil
|
100
|
+
EXAMPLE_TYPE_CHILDREN
|
101
|
+
when /\.(jpg|png|gif|jpeg)$/
|
102
|
+
EXAMPLE_TYPE_IMAGE
|
103
|
+
when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
|
104
|
+
(example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
|
105
|
+
else
|
106
|
+
EXAMPLE_TYPE_STRING
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end #end of method
|
110
|
+
end #End of class
|
111
|
+
end #End of module
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
class DetailPageFilter < BaseFilter
|
3
|
+
|
4
|
+
def evaluate(source)
|
5
|
+
if source.is_a? String
|
6
|
+
result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
|
7
|
+
else
|
8
|
+
result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
|
9
|
+
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
10
|
+
@parent_pattern, @parent_pattern.resolve)
|
11
|
+
end
|
12
|
+
end #end of method
|
13
|
+
end #End of class DetailPageFilter
|
14
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Scrubyt
|
5
|
+
class DownloadFilter < BaseFilter
|
6
|
+
|
7
|
+
def evaluate(source)
|
8
|
+
download_file(source)
|
9
|
+
end #end of method
|
10
|
+
|
11
|
+
def to_sexp
|
12
|
+
[:str, @example]
|
13
|
+
end #end of method to_sexp
|
14
|
+
|
15
|
+
private
|
16
|
+
def download_file(source)
|
17
|
+
host_name = @parent_pattern.evaluation_context.extractor.get_host_name
|
18
|
+
outfile = nil
|
19
|
+
base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
|
20
|
+
return '' if source.size < 4
|
21
|
+
file_name = source.scan(/.+\/(.*)/)[0][0]
|
22
|
+
Net::HTTP.start(base_url) { |http|
|
23
|
+
resp = http.get(source)
|
24
|
+
outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
|
25
|
+
FileUtils.mkdir_p @example
|
26
|
+
open(outfile, 'wb') {|f| f.write(resp.body) }
|
27
|
+
}
|
28
|
+
outfile.scan(/.+\/(.*)/)[0][0]
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.find_nonexisting_file_name(file_name)
|
32
|
+
already_found = false
|
33
|
+
loop do
|
34
|
+
if File.exists? file_name
|
35
|
+
if already_found
|
36
|
+
last_no = file_name.scan(/_(\d+)\./)[0][0]
|
37
|
+
file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
|
38
|
+
else
|
39
|
+
file_name.sub!(/\./) {"_1\."}
|
40
|
+
already_found = true
|
41
|
+
end
|
42
|
+
else
|
43
|
+
break
|
44
|
+
end
|
45
|
+
end
|
46
|
+
file_name
|
47
|
+
end #end of method
|
48
|
+
end #End of class DownloadFilter
|
49
|
+
end #End of module Scrubyt
|