scrubyt 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -11,9 +11,9 @@ class ConstraintTest < Test::Unit::TestCase
|
|
11
11
|
ensure_absence_of_attribute('fill' => 'small_circles')
|
12
12
|
end
|
13
13
|
|
14
|
-
assert_equal(data.children[0].
|
14
|
+
assert_equal(data.children[0].constraints[0].type,
|
15
15
|
Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
|
16
|
-
assert_equal(data.children[0].
|
16
|
+
assert_equal(data.children[0].constraints[1].type,
|
17
17
|
Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
|
18
18
|
end
|
19
19
|
|
@@ -25,9 +25,9 @@ class ConstraintTest < Test::Unit::TestCase
|
|
25
25
|
ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
|
26
26
|
end
|
27
27
|
|
28
|
-
assert_equal(data.children[0].
|
28
|
+
assert_equal(data.children[0].constraints[0].type,
|
29
29
|
Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
|
30
|
-
assert_equal(data.children[0].
|
30
|
+
assert_equal(data.children[0].constraints[1].type,
|
31
31
|
Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
|
32
32
|
end
|
33
33
|
|
@@ -50,14 +50,18 @@ class ConstraintTest < Test::Unit::TestCase
|
|
50
50
|
(shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
|
51
51
|
ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
|
52
52
|
end
|
53
|
-
|
53
|
+
|
54
54
|
data3 = Scrubyt::Extractor.define do
|
55
55
|
fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
|
56
56
|
|
57
|
-
|
58
|
-
|
57
|
+
shape 'line'#.ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
|
58
|
+
# ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
|
59
59
|
end
|
60
60
|
|
61
|
+
p data3.to_xml.to_s
|
62
|
+
exit
|
63
|
+
|
64
|
+
|
61
65
|
data4 = Scrubyt::Extractor.define do
|
62
66
|
fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
|
63
67
|
|
@@ -11,8 +11,8 @@ class ExtractorTest < Test::Unit::TestCase
|
|
11
11
|
|
12
12
|
assert_equal(pattern.name, "root")
|
13
13
|
assert_equal(pattern.children[0].name, 'pattern')
|
14
|
-
assert_equal(pattern.type,
|
15
|
-
assert_equal(pattern.output_type,
|
14
|
+
assert_equal(pattern.type, :root)
|
15
|
+
assert_equal(pattern.output_type, :model)
|
16
16
|
|
17
17
|
assert_equal(pattern.generalize, false)
|
18
18
|
assert_equal(pattern.children[0].generalize, true)
|
@@ -25,12 +25,12 @@ class ExtractorTest < Test::Unit::TestCase
|
|
25
25
|
end
|
26
26
|
|
27
27
|
assert_equal(pattern.name, "root")
|
28
|
-
assert_equal(pattern.type,
|
29
|
-
assert_equal(pattern.output_type,
|
28
|
+
assert_equal(pattern.type, :root)
|
29
|
+
assert_equal(pattern.output_type, :model)
|
30
30
|
|
31
31
|
assert_equal(pattern.children[0].name, "parent")
|
32
|
-
assert_equal(pattern.children[0].type,
|
33
|
-
assert_equal(pattern.children[0].output_type,
|
32
|
+
assert_equal(pattern.children[0].type, :tree)
|
33
|
+
assert_equal(pattern.children[0].output_type, :model)
|
34
34
|
end
|
35
35
|
|
36
36
|
def test_create_more_children
|
@@ -5,75 +5,75 @@ require 'test/unit'
|
|
5
5
|
class FilterTest < Test::Unit::TestCase
|
6
6
|
def test_determine_example_type
|
7
7
|
#Test children example
|
8
|
-
assert_equal(Scrubyt::
|
9
|
-
Scrubyt::
|
8
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type(nil),
|
9
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_CHILDREN)
|
10
10
|
#Test image example
|
11
|
-
assert_equal(Scrubyt::
|
12
|
-
Scrubyt::
|
13
|
-
assert_equal(Scrubyt::
|
14
|
-
Scrubyt::
|
15
|
-
assert_equal(Scrubyt::
|
16
|
-
Scrubyt::
|
17
|
-
assert_equal(Scrubyt::
|
18
|
-
Scrubyt::
|
19
|
-
assert_not_equal(Scrubyt::
|
20
|
-
Scrubyt::
|
21
|
-
assert_not_equal(Scrubyt::
|
22
|
-
Scrubyt::
|
11
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.png'),
|
12
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
|
13
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.gif'),
|
14
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
|
15
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.jpg'),
|
16
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
|
17
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.jpeg'),
|
18
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
|
19
|
+
assert_not_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.zip'),
|
20
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
|
21
|
+
assert_not_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.pif'),
|
22
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
|
23
23
|
#Test XPaths
|
24
|
-
assert_equal(Scrubyt::
|
25
|
-
Scrubyt::
|
26
|
-
assert_equal(Scrubyt::
|
27
|
-
Scrubyt::
|
28
|
-
assert_equal(Scrubyt::
|
29
|
-
Scrubyt::
|
30
|
-
assert_equal(Scrubyt::
|
31
|
-
Scrubyt::
|
32
|
-
assert_equal(Scrubyt::
|
33
|
-
Scrubyt::
|
34
|
-
assert_equal(Scrubyt::
|
35
|
-
Scrubyt::
|
36
|
-
assert_equal(Scrubyt::
|
37
|
-
Scrubyt::
|
38
|
-
assert_equal(Scrubyt::
|
39
|
-
Scrubyt::
|
40
|
-
assert_equal(Scrubyt::
|
41
|
-
Scrubyt::
|
42
|
-
assert_equal(Scrubyt::
|
43
|
-
Scrubyt::
|
44
|
-
assert_equal(Scrubyt::
|
45
|
-
Scrubyt::
|
46
|
-
assert_equal(Scrubyt::
|
47
|
-
Scrubyt::
|
48
|
-
assert_equal(Scrubyt::
|
49
|
-
Scrubyt::
|
50
|
-
assert_equal(Scrubyt::
|
51
|
-
Scrubyt::
|
52
|
-
assert_equal(Scrubyt::
|
53
|
-
Scrubyt::
|
54
|
-
assert_equal(Scrubyt::
|
55
|
-
Scrubyt::
|
56
|
-
assert_not_equal(Scrubyt::
|
57
|
-
Scrubyt::
|
58
|
-
assert_not_equal(Scrubyt::
|
59
|
-
Scrubyt::
|
60
|
-
assert_not_equal(Scrubyt::
|
61
|
-
Scrubyt::
|
62
|
-
assert_not_equal(Scrubyt::
|
63
|
-
Scrubyt::
|
24
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p/img'),
|
25
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
26
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p/h3'),
|
27
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
28
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p/h3/a/h2'),
|
29
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
30
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/h2'),
|
31
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
32
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/h1/h3'),
|
33
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
34
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p'),
|
35
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
36
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('//p'),
|
37
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
38
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p//img'),
|
39
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
40
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('//p//img'),
|
41
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
42
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p[0]/img'),
|
43
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
44
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p[0]'),
|
45
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
46
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('//p[1]'),
|
47
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
48
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/p[1]//img[2]'),
|
49
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
50
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('//p[1]//img'),
|
51
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
52
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/table/tr/td//span/b'),
|
53
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
54
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('/table[0]//tr/td[1]/span[2]/b'),
|
55
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
56
|
+
assert_not_equal(Scrubyt::BaseFilter.determine_example_type('table[0]//tr/td[1]/span[2]/b'),
|
57
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
58
|
+
assert_not_equal(Scrubyt::BaseFilter.determine_example_type('/table[a]//tr/td[1]/span[2]/b'),
|
59
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
60
|
+
assert_not_equal(Scrubyt::BaseFilter.determine_example_type('/tab2le[a]//tr/td[1]/span[2]/b'),
|
61
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
62
|
+
assert_not_equal(Scrubyt::BaseFilter.determine_example_type('/table[a]///tr/td[1]/span[2]/b'),
|
63
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
|
64
64
|
#Test string example
|
65
|
-
assert_equal(Scrubyt::
|
66
|
-
Scrubyt::
|
67
|
-
assert_equal(Scrubyt::
|
68
|
-
Scrubyt::
|
69
|
-
assert_equal(Scrubyt::
|
70
|
-
Scrubyt::
|
71
|
-
assert_equal(Scrubyt::
|
72
|
-
Scrubyt::
|
73
|
-
assert_equal(Scrubyt::
|
74
|
-
Scrubyt::
|
75
|
-
assert_equal(Scrubyt::
|
76
|
-
Scrubyt::
|
65
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('Hello, world!'),
|
66
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
|
67
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('$1022'),
|
68
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
|
69
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('CANON'),
|
70
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
|
71
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('This is a string'),
|
72
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
|
73
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('45'),
|
74
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
|
75
|
+
assert_equal(Scrubyt::BaseFilter.determine_example_type('td'),
|
76
|
+
Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
|
77
77
|
|
78
78
|
end
|
79
79
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrubyt
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.8
|
7
|
+
date: 2007-04-12 00:00:00 +02:00
|
8
8
|
summary: A powerful Web-scraping framework
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,36 +34,43 @@ files:
|
|
34
34
|
- CHANGELOG
|
35
35
|
- Rakefile
|
36
36
|
- lib/scrubyt.rb
|
37
|
-
- lib/scrubyt/utils/shared_utils.rb
|
38
|
-
- lib/scrubyt/utils/xpathutils.rb
|
39
|
-
- lib/scrubyt/utils/simple_example_lookup.rb
|
40
|
-
- lib/scrubyt/utils/compound_example_lookup.rb
|
41
37
|
- lib/scrubyt/output/result_dumper.rb
|
38
|
+
- lib/scrubyt/output/result.rb
|
42
39
|
- lib/scrubyt/output/export.rb
|
43
40
|
- lib/scrubyt/output/post_processor.rb
|
44
|
-
- lib/scrubyt/
|
41
|
+
- lib/scrubyt/utils/compound_example_lookup.rb
|
42
|
+
- lib/scrubyt/utils/simple_example_lookup.rb
|
43
|
+
- lib/scrubyt/utils/ruby_extensions.rb
|
44
|
+
- lib/scrubyt/utils/xpathutils.rb
|
45
|
+
- lib/scrubyt/utils/shared_utils.rb
|
45
46
|
- lib/scrubyt/core/navigation/navigation_actions.rb
|
46
47
|
- lib/scrubyt/core/navigation/fetch_action.rb
|
47
|
-
- lib/scrubyt/core/scraping/result_indexer.rb
|
48
|
-
- lib/scrubyt/core/scraping/constraint_adder.rb
|
49
48
|
- lib/scrubyt/core/scraping/constraint.rb
|
50
|
-
- lib/scrubyt/core/scraping/filter.rb
|
51
49
|
- lib/scrubyt/core/scraping/pattern.rb
|
52
50
|
- lib/scrubyt/core/scraping/pre_filter_document.rb
|
53
51
|
- lib/scrubyt/core/scraping/compound_example.rb
|
52
|
+
- lib/scrubyt/core/scraping/constraint_adder.rb
|
53
|
+
- lib/scrubyt/core/scraping/result_indexer.rb
|
54
|
+
- lib/scrubyt/core/scraping/filters/attribute_filter.rb
|
55
|
+
- lib/scrubyt/core/scraping/filters/base_filter.rb
|
56
|
+
- lib/scrubyt/core/scraping/filters/regexp_filter.rb
|
57
|
+
- lib/scrubyt/core/scraping/filters/tree_filter.rb
|
58
|
+
- lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
|
59
|
+
- lib/scrubyt/core/scraping/filters/detail_page_filter.rb
|
60
|
+
- lib/scrubyt/core/scraping/filters/download_filter.rb
|
54
61
|
- lib/scrubyt/core/shared/u_r_i_builder.rb
|
55
|
-
- lib/scrubyt/core/shared/evaluation_context.rb
|
56
62
|
- lib/scrubyt/core/shared/extractor.rb
|
63
|
+
- lib/scrubyt/core/shared/evaluation_context.rb
|
57
64
|
test_files:
|
58
65
|
- test/unittests/input
|
66
|
+
- test/unittests/constraint_test.rb
|
59
67
|
- test/unittests/filter_test.rb
|
60
|
-
- test/unittests/pattern_test.rb
|
61
|
-
- test/unittests/extractor_test.rb
|
62
68
|
- test/unittests/xpathutils_test.rb
|
63
|
-
- test/unittests/
|
69
|
+
- test/unittests/extractor_test.rb
|
70
|
+
- test/unittests/pattern_test.rb
|
64
71
|
- test/unittests/simple_example_lookup_test.rb
|
65
|
-
- test/unittests/input/constraint_test.html
|
66
72
|
- test/unittests/input/test.html
|
73
|
+
- test/unittests/input/constraint_test.html
|
67
74
|
rdoc_options: []
|
68
75
|
|
69
76
|
extra_rdoc_files: []
|
@@ -1,201 +0,0 @@
|
|
1
|
-
module Scrubyt
|
2
|
-
##
|
3
|
-
#=<tt>Filter out relevant pieces from the parent pattern</tt>
|
4
|
-
#
|
5
|
-
#A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
|
6
|
-
#it reaches the bottom. The biggest difference is that instead of water, a HTML
|
7
|
-
#document travels through the space.
|
8
|
-
#
|
9
|
-
#Of course Scrubyt would not make much sense if the same document would arrive at
|
10
|
-
#the bottom that was poured in at the top - since in this case we might use an
|
11
|
-
#identity transformation (i.e. do nothing with the input) as well.
|
12
|
-
#
|
13
|
-
#This is where filters come in: as their name says, they filter the stuff that is
|
14
|
-
#pouring from above, to leave the interesting parts and discard the rest.
|
15
|
-
#The working of a filter will be explained most easily by the help of an example.
|
16
|
-
#Let's consider that we would like to extract information from a webshop; Concretely
|
17
|
-
#we are interested in the name of the items and the URL pointing to the image of the
|
18
|
-
#item.
|
19
|
-
#
|
20
|
-
#To accomplish this, first we select the items with the pattern item (a pattern is
|
21
|
-
#a logical grouping of filters; see Pattern documentation). Then our new
|
22
|
-
#context is the result extracted by the 'item' pattern; For every 'item' pattern, further
|
23
|
-
#extract the name and the image of the item; and finally, extract the href attribute
|
24
|
-
#of the image. Let's see an illustration:
|
25
|
-
#
|
26
|
-
# root --> This pattern is called a 'root pattern', It is invisible to you
|
27
|
-
# | and basically it represents the document; it has no filters
|
28
|
-
# +-- item --> Filter what's coming from above (the whole document) to get
|
29
|
-
# | relevant pieces of data (in this case webshop items)
|
30
|
-
# +-- name --> Again, filter what's coming from above (a webshop item) and
|
31
|
-
# | leave only item names after this operation
|
32
|
-
# +-- image --> This time filter the image of the item
|
33
|
-
# |
|
34
|
-
# +-- href --> And finally, from the image elements, get the attribute 'href'
|
35
|
-
class Filter
|
36
|
-
#Type of the example this filter is extracted with
|
37
|
-
|
38
|
-
#XPath example, like html/body/tr/td[1] etc.
|
39
|
-
EXAMPLE_TYPE_XPATH = 0
|
40
|
-
#String from the document, for example 'Canon EOS 300 D'.
|
41
|
-
EXAMPLE_TYPE_STRING = 1
|
42
|
-
#Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
|
43
|
-
EXAMPLE_TYPE_IMAGE = 2
|
44
|
-
#No example - the actual XPath is determined from the children XPaths (their LCA)
|
45
|
-
EXAMPLE_TYPE_CHILDREN = 3
|
46
|
-
#Regexp example, like /\d+@*\d+[a-z]/
|
47
|
-
EXAMPLE_TYPE_REGEXP = 4
|
48
|
-
#Compound example, like :contains => 'goodies'
|
49
|
-
EXAMPLE_TYPE_COMPOUND = 5
|
50
|
-
|
51
|
-
attr_accessor :example_type, :parent_pattern, :temp_sink,
|
52
|
-
:constraints, :xpath, :regexp, :example, :source, :sink
|
53
|
-
|
54
|
-
def initialize(parent_pattern, example=nil, *args)
|
55
|
-
@parent_pattern = parent_pattern
|
56
|
-
#If the example type is not explicitly defined in the pattern definition,
|
57
|
-
#try to determine it automatically from the example
|
58
|
-
#@example_type = (args[0] == nil ? Filter.determine_example_type(example) :
|
59
|
-
# args[0][:example_type])
|
60
|
-
#TODOOOOO correct this!
|
61
|
-
@example_type = Filter.determine_example_type(example)
|
62
|
-
@sink = [] #output of a filter
|
63
|
-
@source = [] #input of a filter
|
64
|
-
@example = example
|
65
|
-
@xpath = nil #The xpath to evaluate this filter
|
66
|
-
#temp sinks are used for the initial run when determining the XPaths for examples;
|
67
|
-
#@temp_sink = nil
|
68
|
-
@constraints = [] #list of constraints
|
69
|
-
end
|
70
|
-
|
71
|
-
#Evaluate this filter. This method should not be called directly - as the pattern hierarchy
|
72
|
-
#is evaluated, every pattern evaluates its filters and then they are calling this method
|
73
|
-
def evaluate(source)
|
74
|
-
case @parent_pattern.type
|
75
|
-
when Scrubyt::Pattern::PATTERN_TYPE_TREE
|
76
|
-
result = source/@xpath
|
77
|
-
#puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
|
78
|
-
result.class == Hpricot::Elements ? result.map : [result]
|
79
|
-
when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
80
|
-
puts "Evaluating: #{@parent_pattern.name}"
|
81
|
-
attribute_value = [source.attributes[@example]]
|
82
|
-
return attribute_value if attribute_value[0]
|
83
|
-
@@attribute_in_parent = nil
|
84
|
-
Filter.traverse_up_until_attribute_found(source.parent, @example)
|
85
|
-
@@attribute_in_parent
|
86
|
-
when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
|
87
|
-
source.inner_text.scan(@example).flatten
|
88
|
-
when Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
89
|
-
#p @parent_pattern.name
|
90
|
-
result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
|
91
|
-
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
92
|
-
@parent_pattern)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
#For all the tree patterns, generate an XPath based on the given example
|
97
|
-
#Also this method should not be called directly; It is automatically called for every tree
|
98
|
-
#pattern directly after wrapper definition
|
99
|
-
def generate_XPath_for_example(next_page_example=false)
|
100
|
-
#puts "generating example for: #{@parent_pattern.name}"
|
101
|
-
#puts @example_type
|
102
|
-
case @example_type
|
103
|
-
when EXAMPLE_TYPE_XPATH
|
104
|
-
@xpath = @example
|
105
|
-
when EXAMPLE_TYPE_STRING
|
106
|
-
@temp_sink = SimpleExampleLookup.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
107
|
-
@example,
|
108
|
-
next_page_example )
|
109
|
-
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
110
|
-
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
111
|
-
when EXAMPLE_TYPE_CHILDREN
|
112
|
-
current_example_index = 0
|
113
|
-
loop do
|
114
|
-
all_child_temp_sinks = []
|
115
|
-
@parent_pattern.children.each do |child_pattern|
|
116
|
-
all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
|
117
|
-
end
|
118
|
-
result = all_child_temp_sinks.pop
|
119
|
-
if all_child_temp_sinks.empty?
|
120
|
-
result = result.parent
|
121
|
-
else
|
122
|
-
all_child_temp_sinks.each do |child_sink|
|
123
|
-
result = XPathUtils.lowest_common_ancestor(result, child_sink)
|
124
|
-
end
|
125
|
-
end
|
126
|
-
xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
|
127
|
-
XPathUtils.generate_XPath(result, nil, true)
|
128
|
-
if @parent_pattern.filters.size < current_example_index + 1
|
129
|
-
@parent_pattern.filters << Scrubyt::Filter.new(@parent_pattern)
|
130
|
-
end
|
131
|
-
@parent_pattern.filters[current_example_index].xpath = xpath
|
132
|
-
@parent_pattern.filters[current_example_index].temp_sink = result
|
133
|
-
@parent_pattern.children.each do |child_pattern|
|
134
|
-
next if child_pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
135
|
-
child_pattern.filters[current_example_index].xpath =
|
136
|
-
child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
|
137
|
-
XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
|
138
|
-
end
|
139
|
-
if @parent_pattern.children[0].examples == nil
|
140
|
-
break if @parent_pattern.children[0].filters.size == current_example_index+1
|
141
|
-
else
|
142
|
-
break if @parent_pattern.children[0].examples.size == current_example_index+1
|
143
|
-
end
|
144
|
-
current_example_index += 1
|
145
|
-
end
|
146
|
-
when EXAMPLE_TYPE_IMAGE
|
147
|
-
@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
|
148
|
-
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
|
149
|
-
when EXAMPLE_TYPE_COMPOUND
|
150
|
-
@temp_sink = CompoundExampleLookup.find_node_from_compund_example( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
151
|
-
@example,
|
152
|
-
next_page_example )
|
153
|
-
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
154
|
-
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
155
|
-
end
|
156
|
-
end
|
157
|
-
|
158
|
-
def setup_relative_XPaths
|
159
|
-
return if !@parent_pattern.parent.parent
|
160
|
-
parent_filter = @parent_pattern.parent.filters[@parent_pattern.filters.index(self)]
|
161
|
-
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_filter.xpath, @xpath) if (@xpath =~ /^\/html/)
|
162
|
-
end
|
163
|
-
|
164
|
-
#Dispatcher method to add constraints; of course, as with any method_missing, this method
|
165
|
-
#should not be called directly
|
166
|
-
def method_missing(method_name, *args, &block)
|
167
|
-
constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
|
168
|
-
end
|
169
|
-
|
170
|
-
private
|
171
|
-
def self.traverse_up_until_attribute_found(source, attribute)
|
172
|
-
if (!source.parent.is_a? Hpricot::Doc)
|
173
|
-
#p source.attributes
|
174
|
-
#p attribute
|
175
|
-
#p source.attributes[attribute]
|
176
|
-
@@attribute_in_parent = source.attributes[attribute] if source.attributes[attribute]
|
177
|
-
traverse_up_until_attribute_found(source.parent, attribute) if !@attribute_in_parent
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
def self.determine_example_type(example)
|
182
|
-
if example.instance_of? Regexp
|
183
|
-
EXAMPLE_TYPE_REGEXP
|
184
|
-
elsif example.instance_of? Hash
|
185
|
-
EXAMPLE_TYPE_COMPOUND
|
186
|
-
else
|
187
|
-
case example
|
188
|
-
when nil
|
189
|
-
EXAMPLE_TYPE_CHILDREN
|
190
|
-
when /\.(jpg|png|gif|jpeg)$/
|
191
|
-
EXAMPLE_TYPE_IMAGE
|
192
|
-
when
|
193
|
-
/^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
|
194
|
-
(example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
|
195
|
-
else
|
196
|
-
EXAMPLE_TYPE_STRING
|
197
|
-
end
|
198
|
-
end
|
199
|
-
end #End of method determine_example_type
|
200
|
-
end #End of class
|
201
|
-
end #End of module
|