scrubyt 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -1,27 +1,27 @@
|
|
1
1
|
module Scrubyt
|
2
2
|
##
|
3
3
|
#=<tt>Rejecting result instances based on further rules</tt>
|
4
|
-
#
|
4
|
+
#
|
5
5
|
#The two most trivial problems with a set of rules is that they match either less
|
6
6
|
#or more instances than we would like them to. Constraints are a way to remedy the second problem:
|
7
7
|
#they serve as a tool to filter out some result instances based on rules. A typical
|
8
8
|
#example:
|
9
|
-
#
|
9
|
+
#
|
10
10
|
#* *ensure_presence_of_ancestor_pattern* consider this model:
|
11
11
|
# <book>
|
12
12
|
# <author>...</author>
|
13
13
|
# <title>...</title>
|
14
14
|
# </book>
|
15
|
-
#
|
15
|
+
#
|
16
16
|
#If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
|
17
|
-
#'author' and 'title', only those books will be matched which have an author and a
|
17
|
+
#'author' and 'title', only those books will be matched which have an author and a
|
18
18
|
#title (i.e. the child patterns author and title must extract something). This is a way
|
19
|
-
#to say 'a book MUST have an author and a title'.
|
19
|
+
#to say 'a book MUST have an author and a title'.
|
20
20
|
class Constraint
|
21
21
|
#There are more possible ways of applying/checking constraints in the case of
|
22
|
-
#ones that can not be checked in the context node (e.g. ensure_presence_of -
|
22
|
+
#ones that can not be checked in the context node (e.g. ensure_presence_of -
|
23
23
|
#since it may require the evaluation of child patterns of the context pattern to
|
24
|
-
#arbitray level)
|
24
|
+
#arbitrary level)
|
25
25
|
#
|
26
26
|
#In such cases, the possibilities are:
|
27
27
|
#
|
@@ -29,56 +29,54 @@ module Scrubyt
|
|
29
29
|
# pattern is evaluated. This can mess things up, since if any ancestor node uses
|
30
30
|
# the sinks of predecessor(s) other than the context node, those need to be evaluated
|
31
31
|
# too, and we may run into a cyclic dependency or at least a complicated recursion
|
32
|
-
#
|
33
|
-
#2) Post processing - evaluate normally and throw out results which do not pass the
|
32
|
+
#
|
33
|
+
#2) Post processing - evaluate normally and throw out results which do not pass the
|
34
34
|
# constraint
|
35
35
|
#
|
36
36
|
#2b) Do it on the XML level - most probably this solution will be implemented
|
37
|
-
|
37
|
+
|
38
38
|
# Different constraint types
|
39
39
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
|
40
40
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
|
41
41
|
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
|
42
42
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
|
43
43
|
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
|
44
|
-
|
45
|
-
|
46
|
-
attr_reader :type, :target
|
47
|
-
|
44
|
+
|
45
|
+
|
46
|
+
attr_reader :type, :target
|
47
|
+
|
48
48
|
#Add 'ensure presence of ancestor pattern' constraint
|
49
|
-
|
49
|
+
|
50
50
|
#If this type of constraint is added to a pattern, it must have an ancestor pattern
|
51
51
|
#(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
|
52
52
|
#'Has an ancestor pattern' means that the ancestor pattern actually extracts something
|
53
53
|
#(just by looking at the wrapper model, the ancestor pattern is always present)
|
54
54
|
#Note that from this type of constraint there is no 'ensure_absence' version, since
|
55
|
-
#I could not think about an use case for that
|
56
|
-
def self.add_ensure_presence_of_pattern(
|
57
|
-
Constraint.new(
|
55
|
+
#I could not think of a use case for that
|
56
|
+
def self.add_ensure_presence_of_pattern(ancestor)
|
57
|
+
Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
#Add 'ensure absence of attribute' constraint
|
61
|
-
|
61
|
+
|
62
62
|
#If this type of constraint is added to a pattern, the HTML node it targets
|
63
63
|
#must NOT have an attribute named "attribute_name" with the value "attribute_value"
|
64
|
-
def self.add_ensure_absence_of_attribute(
|
65
|
-
Constraint.new(
|
66
|
-
attribute_hash,
|
64
|
+
def self.add_ensure_absence_of_attribute(attribute_hash)
|
65
|
+
Constraint.new(attribute_hash,
|
67
66
|
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
|
68
67
|
end
|
69
|
-
|
68
|
+
|
70
69
|
#Add 'ensure presence of attribute' constraint
|
71
|
-
|
70
|
+
|
72
71
|
#If this type of constraint is added to a pattern, the HTML node it targets
|
73
72
|
#must have an attribute named "attribute_name" with the value "attribute_value"
|
74
|
-
def self.add_ensure_presence_of_attribute(
|
75
|
-
Constraint.new(
|
76
|
-
attribute_hash,
|
73
|
+
def self.add_ensure_presence_of_attribute(attribute_hash)
|
74
|
+
Constraint.new(attribute_hash,
|
77
75
|
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
|
78
76
|
end
|
79
|
-
|
80
|
-
#Add 'ensure absence of ancestor node' constraint
|
81
|
-
|
77
|
+
|
78
|
+
#Add 'ensure absence of ancestor node' constraint
|
79
|
+
|
82
80
|
#If this type of constraint is added to a pattern, the HTML node extracted by the pattern
|
83
81
|
#must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
|
84
82
|
#
|
@@ -88,14 +86,13 @@ module Scrubyt
|
|
88
86
|
#class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
|
89
87
|
#
|
90
88
|
#"attributes" can be empty - in this case just the 'node_name' is checked
|
91
|
-
def self.add_ensure_absence_of_ancestor_node(
|
92
|
-
Constraint.new(
|
93
|
-
|
94
|
-
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
|
89
|
+
def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
|
90
|
+
Constraint.new([node_name, attributes],
|
91
|
+
CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
|
95
92
|
end
|
96
|
-
|
97
|
-
#Add 'ensure presence of ancestor node' constraint
|
98
|
-
|
93
|
+
|
94
|
+
#Add 'ensure presence of ancestor node' constraint
|
95
|
+
|
99
96
|
#If this type of constraint is added to a pattern, the HTML node extracted by the pattern
|
100
97
|
#must contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
|
101
98
|
#
|
@@ -105,12 +102,11 @@ module Scrubyt
|
|
105
102
|
#class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
|
106
103
|
#
|
107
104
|
#"attributes" can be empty - in this case just the 'node_name' is checked
|
108
|
-
def self.add_ensure_presence_of_ancestor_node(
|
109
|
-
Constraint.new(
|
110
|
-
|
111
|
-
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
|
105
|
+
def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
|
106
|
+
Constraint.new([node_name, attributes],
|
107
|
+
CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
|
112
108
|
end
|
113
|
-
|
109
|
+
|
114
110
|
#Evaluate the constraint; if this function returns true,
|
115
111
|
#it means that the constraint passed, i.e. its filter will be added to the extracted
|
116
112
|
#content of the pattern
|
@@ -123,21 +119,20 @@ module Scrubyt
|
|
123
119
|
attribute_present(result)
|
124
120
|
when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
|
125
121
|
!attribute_present(result)
|
126
|
-
when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
|
127
|
-
ancestor_node_present(result)
|
128
|
-
when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
|
122
|
+
when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
|
123
|
+
ancestor_node_present(result)
|
124
|
+
when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
|
129
125
|
!ancestor_node_present(result)
|
130
126
|
end
|
131
127
|
end
|
132
|
-
|
128
|
+
|
133
129
|
private
|
134
130
|
#We would not like these to be called from outside
|
135
|
-
def initialize(
|
136
|
-
@type = type
|
137
|
-
@parent_filter = parent_filter
|
131
|
+
def initialize(target, type)
|
138
132
|
@target = target
|
139
|
-
|
140
|
-
|
133
|
+
@type = type
|
134
|
+
end
|
135
|
+
|
141
136
|
#Implementation of the ancestor node presence test
|
142
137
|
#Check the documentation of the add_ensure_presence_of_ancestor_node method
|
143
138
|
#for further information on the result parameter
|
@@ -153,21 +148,22 @@ module Scrubyt
|
|
153
148
|
end
|
154
149
|
false
|
155
150
|
end
|
156
|
-
|
151
|
+
|
157
152
|
def attribute_present(result)
|
153
|
+
return unless result.is_a? Hpricot::Elem
|
158
154
|
match = true
|
159
155
|
#If v = nil, the value of the attribute can be arbitrary;
|
160
156
|
#Therefore, in this case we just have to make sure that the attribute is
|
161
157
|
#present (i.e. != nil), we don't care about the value
|
162
158
|
@target.each do |k,v|
|
163
159
|
if v == nil
|
164
|
-
match &&= (result.attributes[k.to_s] != nil)
|
160
|
+
match &&= (result.attributes[k.to_s] != nil)
|
165
161
|
else
|
166
|
-
match &&= (result.attributes[k.to_s] == v.to_s)
|
167
|
-
end
|
162
|
+
match &&= (result.attributes[k.to_s] == v.to_s)
|
163
|
+
end
|
168
164
|
end
|
169
165
|
match
|
170
166
|
end
|
171
|
-
|
167
|
+
|
172
168
|
end #end of class
|
173
169
|
end #end of module
|
@@ -10,58 +10,35 @@ module Scrubyt
|
|
10
10
|
#I will not document the functions since these are just forwarders; See the 'real'
|
11
11
|
#functions with their documentation in Scrubyt::Constraint.rb
|
12
12
|
class ConstraintAdder
|
13
|
-
|
14
|
-
def self.ensure_presence_of_pattern(
|
15
|
-
|
16
|
-
pattern #To make chaining possible
|
13
|
+
|
14
|
+
def self.ensure_presence_of_pattern(ancestor_node_name)
|
15
|
+
Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
|
17
16
|
end
|
18
17
|
|
19
|
-
def self.ensure_presence_of_ancestor_node(
|
20
|
-
|
21
|
-
|
22
|
-
pattern #To make chaining possible
|
18
|
+
def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
|
19
|
+
Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
|
20
|
+
prepare_attributes(attributes))
|
23
21
|
end
|
24
22
|
|
25
|
-
def self.ensure_absence_of_ancestor_node(
|
26
|
-
|
23
|
+
def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
|
24
|
+
Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
|
27
25
|
prepare_attributes(attributes))
|
28
|
-
pattern #To make chaining possible
|
29
26
|
end
|
30
27
|
|
31
|
-
def self.ensure_presence_of_attribute(
|
32
|
-
|
33
|
-
pattern #To make chaining possible
|
28
|
+
def self.ensure_presence_of_attribute(attribute_hash)
|
29
|
+
Constraint.add_ensure_presence_of_attribute(attribute_hash)
|
34
30
|
end
|
35
31
|
|
36
|
-
def self.ensure_absence_of_attribute(
|
37
|
-
|
38
|
-
pattern #To make chaining possible
|
32
|
+
def self.ensure_absence_of_attribute(attribute_hash)
|
33
|
+
Constraint.add_ensure_absence_of_attribute(attribute_hash)
|
39
34
|
end
|
40
|
-
|
41
|
-
private
|
42
|
-
def self.find_by_name(root_pattern, name)
|
43
|
-
@found_pattern = nil
|
44
|
-
find_by_name_recursive(root_pattern, name)
|
45
|
-
if (@found_pattern == nil)
|
46
|
-
#$Logger.error("Fatal: No pattern named #{name} exists!")
|
47
|
-
puts "Fatal: No pattern named #{name} exists!"
|
48
|
-
end
|
49
|
-
@found_pattern
|
50
|
-
end
|
51
|
-
|
52
|
-
def self.find_by_name_recursive(pattern, name)
|
53
|
-
if pattern.name == name
|
54
|
-
@found_pattern = pattern
|
55
|
-
else
|
56
|
-
pattern.children.each {|child| find_by_name_recursive(child, name)}
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
35
|
+
|
36
|
+
private
|
60
37
|
def self.prepare_attributes(attributes)
|
61
38
|
attribute_pairs = []
|
62
39
|
attributes.each do |key, value|
|
63
40
|
if (value.instance_of? Array)
|
64
|
-
value.each {|val| attribute_pairs << [key,val]}
|
41
|
+
value.each {|val| attribute_pairs << [key,val]}
|
65
42
|
else
|
66
43
|
attribute_pairs << [key, value]
|
67
44
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Scrubyt
  #Filter whose result is the value of one attribute; the attribute name is
  #the example (@example) this filter was created with.
  class AttributeFilter < BaseFilter

    #Ask XPathUtils.find_nearest_node_with_attribute for the node belonging to
    #+source+ that carries the attribute named by @example.
    #
    #Returns a one-element array holding that attribute's value, or nil when
    #the lookup did not produce an Hpricot::Elem.
    def evaluate(source)
      node = XPathUtils.find_nearest_node_with_attribute(source, @example)
      return nil unless node.is_a? Hpricot::Elem
      [node.attributes[@example]]
    end

    #S-expression form of this filter: the attribute name as a :str node.
    def to_sexp
      [:str, @example]
    end #end of method to_sexp
  end #End of class AttributeFilter
end #End of module Scrubyt
|
@@ -0,0 +1,111 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Filter out relevant pieces from the parent pattern</tt>
  #
  #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
  #it reaches the bottom. The biggest difference is that instead of water, a HTML
  #document travels through the space.
  #
  #Of course Scrubyt would not make much sense if the same document would arrive at
  #the bottom that was poured in at the top - since in this case we might use an
  #identity transformation (i.e. do nothing with the input) as well.
  #
  #This is where filters come in: as their name says, they filter the stuff that is
  #pouring from above, to leave the interesting parts and discard the rest.
  #The working of a filter will be explained most easily by the help of an example.
  #Let's consider that we would like to extract information from a webshop; Concretely
  #we are interested in the name of the items and the URL pointing to the image of the
  #item.
  #
  #To accomplish this, first we select the items with the pattern item (a pattern is
  #a logical grouping of filters; see Pattern documentation) Then our new
  #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
  #extract the name and the image of the item; and finally, extract the href attribute
  #of the image. Let's see an illustration:
  #
  #   root             --> This pattern is called a 'root pattern', It is invisible to you
  #   |                    and basically it represents the document; it has no filters
  #   +-- item         --> Filter what's coming from above (the whole document) to get
  #       |                relevant pieces of data (in this case webshop items)
  #       +-- name     --> Again, filter what's coming from above (a webshop item) and
  #       |                leave only item names after this operation
  #       +-- image    --> This time filter the image of the item
  #           |
  #           +-- href --> And finally, from the image elements, get the attribute 'href'
  class BaseFilter
    #Type of the example this filter is extracted with

    #XPath example, like html/body/tr/td[1] etc.
    EXAMPLE_TYPE_XPATH = 0
    #String from the document, for example 'Canon EOS 300 D'.
    EXAMPLE_TYPE_STRING = 1
    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
    EXAMPLE_TYPE_IMAGE = 2
    #No example - the actual XPath is determined from the children XPaths (their LCA)
    EXAMPLE_TYPE_CHILDREN = 3
    #Regexp example, like /\d+@*\d+[a-z]/
    EXAMPLE_TYPE_REGEXP = 4
    #Compound example, like :contains => 'goodies'
    EXAMPLE_TYPE_COMPOUND = 5

    attr_accessor(:example_type, :parent_pattern, :temp_sink,
                  :constraints, :xpath, :regexp, :example, :source, :sink)

    #Factory: build the filter matching the type of +parent_pattern+.
    #
    #The class name is derived from parent_pattern.type by camel-casing it and
    #appending 'Filter' (e.g. :detail_page => DetailPageFilter). The type 'root'
    #has no class of its own and yields a plain BaseFilter.
    #
    #Raises NameError if no filter class with the derived name exists.
    def self.create(parent_pattern, example=nil)
      filter_name = parent_pattern.type.to_s.split("_").map { |e| e.capitalize }.join + 'Filter'
      return BaseFilter.new(parent_pattern, example) if filter_name == 'RootFilter'
      #Resolve the class by constant lookup instead of evaluating a generated string
      Scrubyt.const_get(filter_name).new(parent_pattern, example)
    end

    #Classify +example+ into one of the EXAMPLE_TYPE_* constants:
    #Regexp => REGEXP, Hash => COMPOUND, nil => CHILDREN, a string ending in an
    #image extension => IMAGE, a string shaped like a simple XPath => XPATH,
    #anything else => STRING.
    def self.determine_example_type(example)
      if example.instance_of? Regexp
        EXAMPLE_TYPE_REGEXP
      elsif example.instance_of? Hash
        EXAMPLE_TYPE_COMPOUND
      else
        case example
        when nil
          EXAMPLE_TYPE_CHILDREN
        when /\.(jpg|png|gif|jpeg)$/
          EXAMPLE_TYPE_IMAGE
        when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
          #Anything matching this expression starts with '/', so it is always an XPath
          EXAMPLE_TYPE_XPATH
        else
          EXAMPLE_TYPE_STRING
        end
      end
    end #end of method

    #Dispatcher method to add constraints; of course, as with any method_missing, this method
    #should not be called directly
    #
    #A call to ensure_xxx(*args) is forwarded to Constraint.add_ensure_xxx(*args) and the
    #resulting constraint is appended to +constraints+. The Constraint factories take only
    #the constraint arguments, so the filter itself is not passed along.
    #Any other unknown method raises NoMethodError.
    #TODO still used?
    def method_missing(method_name, *args, &block)
      return super unless method_name.to_s =~ /^ensure.+/
      constraints << Constraint.send("add_#{method_name}".to_sym, *args)
    end

    #Keep respond_to? in sync with the names method_missing accepts.
    def respond_to_missing?(method_name, include_private = false)
      !(method_name.to_s =~ /^ensure.+/).nil? || super
    end

    #The base filter has no S-expression representation.
    def to_sexp
      nil
    end

    private
    #We don't want this to be accessible from outside
    def initialize(parent_pattern, example)
      @example_type = BaseFilter.determine_example_type(example)
      @parent_pattern = parent_pattern
      @sink = []        #output of a filter
      @source = []      #input of a filter
      @example = example
      @xpath = nil      #The xpath to evaluate this filter
      @constraints = [] #list of constraints
    end
  end #End of class
end #End of module
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
  #Filter that hands a detail page over to a sub-extractor.
  class DetailPageFilter < BaseFilter

    #Run the parent pattern's sub-extractor on a detail page.
    #
    #+source+ is either the page address itself (a String) or a node; for a
    #node the address is the 'href' attribute of the element returned by
    #XPathUtils.find_nearest_node_with_attribute.
    #
    #Returns whatever the extractor's evaluate_subextractor returns.
    def evaluate(source)
      extractor = @parent_pattern.evaluation_context.extractor
      address =
        if source.is_a? String
          source
        else
          XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
        end
      extractor.evaluate_subextractor(address, @parent_pattern, @parent_pattern.resolve)
    end #end of method
  end #End of class DetailPageFilter
end #End of module Scrubyt
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Scrubyt
  #Filter that downloads the file its source points to and stores it in the
  #directory given as the example (@example).
  class DownloadFilter < BaseFilter

    #Download +source+ and return the name (without directory) of the file it
    #was saved as; returns '' when +source+ is shorter than 4 characters.
    def evaluate(source)
      download_file(source)
    end #end of method

    #S-expression form of this filter: the target directory as a :str node.
    def to_sexp
      [:str, @example]
    end #end of method to_sexp

    #Return a path at which no file exists yet.
    #
    #If nothing exists at +file_name+ it is returned as is. Otherwise a counter
    #is inserted before the first dot of the file's base name (or appended if
    #the base name has no dot): 'dir/pic.jpg' => 'dir/pic_1.jpg', 'dir/pic_2.jpg', ...
    #Only the part after the last '/' is touched, so dots in directory names
    #are left alone, and +file_name+ itself is never modified.
    def self.find_nonexisting_file_name(file_name)
      return file_name unless File.exist? file_name
      last_slash = file_name.rindex('/')
      directory  = last_slash ? file_name[0..last_slash] : ''
      base_name  = last_slash ? file_name[(last_slash + 1)..-1] : file_name
      first_dot  = base_name.index('.')
      stem       = first_dot ? base_name[0...first_dot] : base_name
      extension  = first_dot ? base_name[first_dot..-1] : ''
      counter = 1
      counter += 1 while File.exist?("#{directory}#{stem}_#{counter}#{extension}")
      "#{directory}#{stem}_#{counter}#{extension}"
    end #end of method

    private
    #Fetch +source+ from the extractor's host over HTTP and write the response
    #body into the @example directory (created if missing).
    #NOTE(review): the host is taken from get_host_name with a pattern that
    #expects the form 'http://host/...', and +source+ is expected to contain
    #a '/'; other shapes raise NoMethodError here - confirm against callers.
    def download_file(source)
      host_name = @parent_pattern.evaluation_context.extractor.get_host_name
      outfile = nil
      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
      return '' if source.size < 4
      file_name = source.scan(/.+\/(.*)/)[0][0]
      Net::HTTP.start(base_url) { |http|
        resp = http.get(source)
        outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
        FileUtils.mkdir_p @example
        #File.open never interprets the name as a command, unlike Kernel#open
        File.open(outfile, 'wb') {|f| f.write(resp.body) }
      }
      File.basename(outfile)
    end
  end #End of class DownloadFilter
end #End of module Scrubyt
|