scrubyt 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +41 -0
- data/Rakefile +55 -0
- data/lib/scrubyt.rb +9 -0
- data/lib/scrubyt/constraint.rb +185 -0
- data/lib/scrubyt/constraint_adder.rb +86 -0
- data/lib/scrubyt/export.rb +187 -0
- data/lib/scrubyt/extractor.rb +187 -0
- data/lib/scrubyt/filter.rb +144 -0
- data/lib/scrubyt/pattern.rb +263 -0
- data/lib/scrubyt/result.rb +43 -0
- data/lib/scrubyt/result_dumper.rb +84 -0
- data/lib/scrubyt/xpathutils.rb +196 -0
- data/test/unittests/constraint_test.rb +106 -0
- data/test/unittests/extractor_test.rb +93 -0
- data/test/unittests/filter_test.rb +71 -0
- data/test/unittests/input/constraint_test.html +55 -0
- data/test/unittests/input/test.html +39 -0
- data/test/unittests/xpathutils_test.rb +165 -0
- metadata +63 -0
data/README
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
============================================
|
2
|
+
scRUBYt! - Hpricot and Mechanize on steroids
|
3
|
+
============================================
|
4
|
+
|
5
|
+
A simple to learn and use, yet very powerful web extraction framework written in Ruby. Navigate through the Web, Extract, query, transform and save relevant data from the Web page of interest by the concise and easy to use DSL provided by scRUBYt!.
|
6
|
+
|
7
|
+
=============================================
|
8
|
+
Why do we need one more web-scraping toolkit?
|
9
|
+
=============================================
|
10
|
+
|
11
|
+
After all, we have HPricot, and Rubyful soup, and Mechanize, and scrAPI, and ARIEL and ...
|
12
|
+
Well, because scRUBYt! is different. It has an entirely different philosophy, underlying techniques and use cases - in short, it should be used in different situations with different requirements than the previously mentioned ones.
|
13
|
+
|
14
|
+
If you need something quick and/or would like to have maximal control over the scraping process, I recommend HPricot. Mechanize shines when it comes to Web page navigation. Since scRUBYt! is operating based on XPaths, sometimes you will choose scrAPI because CSS selectors will better suit your needs. The list goes on and on, boiling down to the good old mantra: use the right tool for the right job!
|
15
|
+
|
16
|
+
I hope there will be times when you will want to experiment with Pandora's box and reach after the power of scRUBYt! :-)
|
17
|
+
|
18
|
+
========================================
|
19
|
+
OK, OK, I believe you, what should I do?
|
20
|
+
========================================
|
21
|
+
|
22
|
+
Useful addresses
|
23
|
+
|
24
|
+
scrubyt.rubyforge.org
|
25
|
+
rubyrailways.com (some theory)
|
26
|
+
future: public extractor repository
|
27
|
+
|
28
|
+
==============
|
29
|
+
How to install
|
30
|
+
==============
|
31
|
+
|
32
|
+
ragel
|
33
|
+
hpricot from trunk
|
34
|
+
|
35
|
+
scrubyt gem
|
36
|
+
subversion
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
|
require 'rake/rdoctask'
require 'rake/testtask'
require 'rake/gempackagetask'

# --- Task dependencies ---------------------------------------------------

task "default"  => ["test"]
task "fulltest" => ["test", "blackbox"]

# --- Gem specification ---------------------------------------------------

gem_spec = Gem::Specification.new do |spec|
  spec.name        = 'scrubyt'
  spec.version     = '0.1.0'
  spec.summary     = 'A powerful Web-scraping framework'
  spec.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
  # Test::Unit test cases shipped with the gem.
  spec.test_files = FileList['test/unittests/**/*']
  # Everything else that goes into the package.
  spec.files      = FileList['README', 'Rakefile', 'lib/**/*.rb']
  spec.author     = 'Peter Szinek'
  spec.email      = 'peter@rubyrailways.com'
  spec.homepage   = 'http://www.scrubyt.rubyforge.org'
end

# --- Tasks ---------------------------------------------------------------

# Generate the RDoc API documentation into doc/ using the Allison template.
Rake::RDocTask.new do |rd|
  rd.rdoc_files.add(['lib/**/*.rb'])
  rd.main     = "README"                      # page to start on
  rd.title    = "Scrubyt Documentation"
  rd.template = "resources/allison/allison.rb"
  rd.rdoc_dir = 'doc'                         # rdoc output folder
  rd.options << '--line-numbers' << '--inline-source'
end

# Run the unit test suite.
Rake::TestTask.new do |t|
  t.pattern = 'test/unittests/*_test.rb'
end

# Run the blackbox (end-to-end) tests.
task "blackbox" do
  ruby "test/blackbox/run_blackbox_tests.rb"
end

# Build the gem package (no zip/tar archives).
Rake::GemPackageTask.new(gem_spec) do |pkg|
  pkg.need_zip = false
  pkg.need_tar = false
end
# Entry point of the scRUBYt! library: loads every component of the framework.
#
# The '.rb' suffix is omitted from the require arguments: it is redundant
# (require resolves the extension automatically) and mixing suffixed and
# suffix-less forms of the same path can cause a file to be loaded twice.
require 'scrubyt/constraint_adder'
require 'scrubyt/constraint'
require 'scrubyt/export'
require 'scrubyt/extractor'
require 'scrubyt/filter'
require 'scrubyt/pattern'
require 'scrubyt/result_dumper'
require 'scrubyt/result'
require 'scrubyt/xpathutils'
module Scrubyt
  ##
  #=<tt>Rejecting result instances based on further rules</tt>
  #
  #The two most trivial problems with a set of rules is that they match either less
  #or more instances than we would like them to. Constraints are a way to remedy the second problem:
  #they serve as a tool to filter out some result instances based on rules. A typical
  #example:
  #
  #* *ensure_presence_of_ancestor_pattern* consider this model:
  #    <book>
  #      <author>...</author>
  #      <title>...</title>
  #    </book>
  #
  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
  #'author' and 'title', only those books will be matched which have an author and a
  #title (i.e. the child patterns author and title must extract something). This is a way
  #to say 'a book MUST have an author and a title'.
  class Constraint
    #There are more possible ways of applying/checking constraints in the case of
    #ones that can not be checked in the context node (e.g. ensure_presence_of -
    #since it may require the evaluation of child patterns of the context pattern to
    #arbitrary level)
    #
    #In such cases, the possibilities are:
    #
    #1) make a depth-first evaluation from the context pattern until the needed ancestor
    #   pattern is evaluated. This can mess things up, since if any ancestor node uses
    #   the sinks of predecessor(s) other than the context node, those need to be evaluated
    #   too, and we may run into a cyclic dependency or at least a complicated recursion
    #
    #2) Post processing - evaluate normally and throw out results which do not pass the
    #   constraint
    #
    #2b) Do it on the XML level - most probably this solution will be implemented

    # Different constraint types; stored in @type and dispatched on in #check.
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_PATTERN = 0
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_PATTERN = 1
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 2
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 3
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 4
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 5

    attr_reader :type, :target, :parent_filter

    #Add 'ensure presence of ancestor pattern' constraint
    #
    #If this type of constraint is added to a pattern, it must have an ancestor pattern
    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
    #(just by looking at the wrapper model, the ancestor pattern is always present)
    #ON result level!!!
    def self.add_ensure_presence_of_ancestor_pattern(parent_filter, ancestor)
      Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_PATTERN)
    end

    #Add 'ensure absence of ancestor pattern' constraint
    #
    #If this type of constraint is added to a pattern, it must NOT have an ancestor pattern
    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
    #(just by looking at the wrapper model, the ancestor pattern is always present)
    #ON result level!!!
    def self.add_ensure_absence_of_ancestor_pattern(parent_filter, ancestor)
      Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_PATTERN)
    end

    #Add 'ensure absence of attribute' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must NOT have an attribute named "attribute_name" with the value "attribute_value"
    def self.add_ensure_absence_of_attribute(parent_filter, attribute_hash)
      Constraint.new(parent_filter,
                     attribute_hash,
                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure presence of attribute' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must have an attribute named "attribute_name" with the value "attribute_value"
    def self.add_ensure_presence_of_attribute(parent_filter, attribute_hash)
      Constraint.new(parent_filter,
                     attribute_hash,
                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure absence of ancestor node' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
    #
    #"attributes" is an array of hashes, for example
    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and
    #'class' => 'wide') it has to be written as [{'class' => ['small','wide']}]
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_absence_of_ancestor_node(parent_filter, node_name, attributes)
      Constraint.new(parent_filter,
                     [node_name, attributes],
                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
    end

    #Add 'ensure presence of ancestor node' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
    #must contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
    #(The original comment said "must NOT contain" - a copy-paste error from the
    #absence variant above; the constant used below is the PRESENCE one.)
    #
    #"attributes" is an array of hashes, for example
    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and
    #'class' => 'wide') it has to be written as [{'class' => ['small','wide']}]
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_presence_of_ancestor_node(parent_filter, node_name, attributes)
      Constraint.new(parent_filter,
                     [node_name, attributes],
                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
    end

    #Evaluate the constraint; if this function returns true,
    #it means that the constraint passed, i.e. its filter will be added to the extracted
    #content of the pattern
    #
    #NOTE(review): the two ancestor-PATTERN branches are unimplemented stubs -
    #puts returns nil, so #check is falsy for those constraint types. Confirm
    #whether such constraints are meant to always reject at this stage.
    def check(result)
      case @type
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_PATTERN
        puts "CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_PATTERN"
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_PATTERN
        puts "CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_PATTERN"
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
        attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
        !attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
        ancestor_node_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
        !ancestor_node_present(result)
      end
    end

    private
    #We would not like these to be called from outside
    def initialize(parent_filter, target, type)
      @type = type
      @parent_filter = parent_filter
      @target = target
    end

    #Implementation of the ancestor node presence test
    #Check the documentation of the add_ensure_presence_of_ancestor_node method
    #for further information on the result parameter
    def ancestor_node_present(result)
      found = false #NOTE(review): never read below; left in place (doc-only change)
      node_name = @target[0]
      node_attributes = @target[1]
      #True as soon as any [attr, value] pair matches an ancestor via XPath search.
      node_attributes.each do |pair|
        return true if !result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']").empty?
      end
      #With no attributes given, only the node name itself has to be present.
      if node_attributes.empty?
        return true if !result.search("//#{node_name}").empty?
      end
      false
    end

    #True if the result node carries every attribute in @target.
    def attribute_present(result)
      match = true
      #If v = nil, the value of the attribute can be arbitrary;
      #Therefore, in this case we just have to make sure that the attribute is
      #present (i.e. != nil), we don't care about the value
      @target.each do |k,v|
        if v == nil
          match &&= (result.attributes[k.to_s] != nil)
        else
          match &&= (result.attributes[k.to_s] == v.to_s)
        end
      end
      match
    end

  end #end of class
end #end of module
module Scrubyt
  ##
  #=<tt>Utility class for adding constraints</tt>
  #
  #Originally methods of Pattern - but since Pattern was already too heavy (and after
  #all, adding a constraint (logically) does not belong to Pattern anyway) it was moved
  #to this utility class. In Pattern everything that begins with ensure_
  #is automatically dispatched here.
  #
  #These methods are just forwarders; see the 'real'
  #functions with their documentation in Scrubyt::Constraint.
  class ConstraintAdder

    #Forward 'ensure presence of ancestor pattern' to the first filter of +pattern+.
    #Returns +pattern+ so constraint additions can be chained.
    #
    #Fix: the original called prepare_ensure_ancestor_pattern with the undefined
    #names sym_root/sym_ancestor, raising NameError on every invocation; its
    #result was never used, so the call has been removed.
    def self.ensure_presence_of_ancestor_pattern(pattern, ancestor_node_name)
      pattern.filters[0].ensure_presence_of_ancestor_pattern(ancestor_node_name)
      pattern #To make chaining possible
    end

    #Forward 'ensure absence of ancestor pattern' (same fix as above).
    def self.ensure_absence_of_ancestor_pattern(pattern, ancestor_node_name)
      pattern.filters[0].ensure_absence_of_ancestor_pattern(ancestor_node_name)
      pattern #To make chaining possible
    end

    #Forward 'ensure presence of ancestor node'; the attribute specification is
    #normalized to [key, value] pairs with prepare_attributes first.
    def self.ensure_presence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
      pattern.filters[0].ensure_presence_of_ancestor_node(ancestor_node_name,
                                                          prepare_attributes(attributes))
      pattern #To make chaining possible
    end

    #Forward 'ensure absence of ancestor node'.
    def self.ensure_absence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
      pattern.filters[0].ensure_absence_of_ancestor_node(ancestor_node_name,
                                                         prepare_attributes(attributes))
      pattern #To make chaining possible
    end

    #Forward 'ensure presence of attribute'.
    def self.ensure_presence_of_attribute(pattern, attribute_hash)
      pattern.filters[0].ensure_presence_of_attribute(attribute_hash)
      pattern #To make chaining possible
    end

    #Forward 'ensure absence of attribute'.
    def self.ensure_absence_of_attribute(pattern, attribute_hash)
      pattern.filters[0].ensure_absence_of_attribute(attribute_hash)
      pattern #To make chaining possible
    end

    private
    #NOTE(review): 'private' does not affect methods defined with 'def self.' -
    #the helpers below remain callable from outside; left as-is for compatibility.

    #Look up a pattern by name anywhere in the tree under root_pattern.
    #Returns the pattern, or nil (after printing an error) when none matches.
    def self.find_by_name(root_pattern, name)
      @found_pattern = nil
      find_by_name_recursive(root_pattern, name)
      if (@found_pattern == nil)
        #$Logger.error("Fatal: No pattern named #{name} exists!")
        puts "Fatal: No pattern named #{name} exists!"
      end
      @found_pattern
    end

    #Depth-first helper for find_by_name; stores the match in @found_pattern.
    def self.find_by_name_recursive(pattern, name)
      if pattern.name == name
        @found_pattern = pattern
      else
        pattern.children.each {|child| find_by_name_recursive(child, name)}
      end
    end

    #Normalize an attribute specification into an array of [key, value] pairs.
    #An Array value (several values for the same key) yields one pair per value.
    def self.prepare_attributes(attributes)
      attribute_pairs = []
      attributes.each do |key, value|
        if (value.instance_of? Array)
          value.each {|val| attribute_pairs << [key,val]}
        else
          attribute_pairs << [key, value]
        end
      end
      return attribute_pairs
    end

    #Resolve the context and target patterns of an ancestor-pattern constraint.
    def self.prepare_ensure_ancestor_pattern(pattern, root, ancestor)
      context_pattern = find_by_name(pattern.root_pattern, root)
      target_pattern = find_by_name(pattern.root_pattern, ancestor)
      return [context_pattern, target_pattern]
    end

  end #end of class ConstraintAdder
end #end of module Scrubyt
#require File.join(File.dirname(__FILE__), 'pattern.rb')

module Scrubyt
  # =<tt>exporting previously defined extractors</tt>
  class Export
    ##
    #Exports the given extractor (specified by its root pattern) from the given file
    #
    #_input_file_ - the full path of the file where the extractor was defined. This can
    #be achieved by calling
    #
    #  pattern.export(__File__)
    #
    #from the file of the extractor definition.
    #
    #*parameters*
    #
    #_pattern_ - the root pattern of the extractor. This is the variable 'something' in
    #such a call:
    #
    #  something = Scrubyt::Extractor.define ...
    #
    #However, since the export method should not be called directly (pattern is calling
    #it), you will probably never need to care about this parameter.
    #
    #_output_file_name_ - the name of the file where the exported extractor should be
    #dumped; From default (i.e. if you don't specify this parameter) this is
    #"#{wrapper_name}_extractor_export.rb". You may override this setting if specifying
    #this optional parameter.
    #
    #_extractor_result_file_name_ - the name of the file, where the result of the
    #*exported* extractor should be dumped - for example, if _output_file_name_ is "foo.rb"
    #and _extractor_result_file_name_ is "bar.xml", the extractor is exported to a file named
    #"foo.rb", and after running "foo.rb", the results will be dumped to the file "bar.xml"
    #If this option is not specified, the result is dumped to standard output as XML.
    #
    #Returns the exported extractor source as a String (also written to the output file).
    def self.export(input_file, pattern, output_file_name, extractor_result_file_name)
      @result = ""
      contents = open(input_file).read
      #Raw String#scan result (array of capture arrays); the interpolations below
      #rely on its to_s form to produce the wrapper variable name.
      wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)
      output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
                                              open(output_file_name, 'w')
      export_header(output_file)
      export_extractor(contents, pattern, output_file)
      export_footer(output_file, wrapper_name, extractor_result_file_name)
      cleanup_result
      output_file.write(@result)
      output_file.close
      @result
    end

    private
    #Prepend the require line every exported extractor needs.
    def self.export_header(output_file)
      @result += "require 'lib/extractor.rb'\n\n"
    end

    #Strip the temporary 'P.' prefixes again (the block returns nil, which
    #gsub! treats as the empty replacement string).
    def self.cleanup_result
      @result.gsub!('P.') {}
    end

    #OK, I have to admit: this function is powered by voodoo magic. A lot of voodoo magic.
    #Piles of tons of heaps of voodoo magic :-)
    #
    #The only reason I can expect it to work is that it passes all the tests of the extractors
    #I have created so far. However at the same time I know how to create one easily which
    #would break the exporting, so don't experiment with this too much...
    #
    #The other solutions include:
    #- serialization (yaml, pstore etc) but that would mess the code terribly up - so
    #therefore I did not choose this solution.
    #- defining the block as string - however, this introduces ugly %q{}s etc - all in all,
    #this is still a more viable solution than serialization IMHO
    #- a lot of other tricks - however, all of these introduce a lot of noise which I don't
    #like.
    #
    #Conclusion: If there will be no terrible, unrepairable, uncontrollable etc. problems
    #with this approach, it will be replaced (probably with constructing the extractor as
    #a string). However, until that point, it will stay.
    def self.export_extractor(contents, pattern, output_file)
      first_line = contents.scan(/.*Extractor\.define.*/)
      #During wrapper construction, we count the number of blocks; add one occurrence of
      #end (to close the block of the extractor definition)
      count = pattern.root_pattern.block_count + 1
      #Construct the extractor definition matching regexp based on the number of ends
      definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
      #Since the regexp matching the extractor definition was multiline, get the first
      #line separately and patch it in!
      rows = definition[0].split("\n")
      #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
      #patterns could be matched very easily from the extractor definition (because they begun
      #with 'P.'). Now that P has been removed, mimic it!
      keywords = ['fetch', 'fill_textfield', 'submit', 'end', 'click_link']
      rows.each do |row|
        #Do not prepend P. to comments and empty lines
        next if (row.strip =~ /^#/ || row.strip == '')
        #Do not prepend P. to any of the reserved keywords
        jump_to_next = false
        keywords.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
        next if jump_to_next
        #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
        row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
        #Don't forget also the stuff in parentheses!
        row.gsub!(/\{\s+/) {"{P."}
      end
      rows[0] = first_line
      #@full_definition holds the original definition (at this point, later on it will be
      #gsub!bed and all)
      @full_definition = rows.join("\n")
      #This hash contains all the examples that need to be replaced with their XPath
      #counterparts;"P.#{name}"
      #We are relying on the convention that if an example is defined, it is always
      #the first parameter and it is always a string
      @name_to_xpath_map = {}
      create_name_to_xpath_map(pattern)
      #Replace the examples which are quoted with " and '
      @name_to_xpath_map.each do |name, xpath|
        replace_example_with_xpath(name, xpath, %q{"})
        replace_example_with_xpath(name, xpath, %q{'})
      end
      #Finally, add XPaths to patterns which had no example at the beginning (the XPath was
      #generated from the child patterns)
      @name_to_xpath_map.each do |name, xpath|
        comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
        if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
          @full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
        else
          @full_definition.sub!("P.#{name}") {"P.#{name} \"#{xpath}\"#{comma}"}
        end
      end
      @result += @full_definition
    end

    #Append the code that runs the exported extractor and dumps its result.
    #When extractor_result_file_name is given, the generated code writes the
    #XML there; otherwise it writes to the standard output.
    #
    #Fix: the generated code previously ignored extractor_result_file_name and
    #always wrote to the hard-coded file 'result_of_exported_extractor.xml',
    #contradicting the documented behaviour of Export.export.
    def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
      if extractor_result_file_name
        @result += "\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
      else
        @result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
      end
    end

    #Collect pattern-name => XPath mappings for the whole pattern tree.
    def self.create_name_to_xpath_map(pattern)
      @name_to_xpath_map[pattern.name] = pattern.filters[0].xpath if pattern.filters[0].xpath != nil
      pattern.children.each {|child| create_name_to_xpath_map child}
    end

    #Replace a quoted example argument of "P.name" with its XPath; on success
    #the name is removed from @name_to_xpath_map so the second pass skips it.
    def self.replace_example_with_xpath(name, xpath, left_delimiter, right_delimiter=left_delimiter)
      replacing_xpath = (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{') ?
        "P.#{name}('\"#{xpath}\"')" :
        "P.#{name} \"#{xpath}\""
      @full_definition.sub!(/P\.#{name}\s+#{left_delimiter}(.*)#{right_delimiter}/) do
        @name_to_xpath_map.delete("#{name}")
        replacing_xpath
      end
    end

  end
end