scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. data/CHANGELOG +59 -12
  2. data/Rakefile +2 -2
  3. data/lib/scrubyt.rb +24 -6
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
  6. data/lib/scrubyt/core/scraping/constraint.rb +53 -57
  7. data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
  8. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
  9. data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
  10. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
  11. data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
  13. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
  14. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
  15. data/lib/scrubyt/core/scraping/pattern.rb +292 -157
  16. data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
  17. data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
  18. data/lib/scrubyt/core/shared/extractor.rb +122 -163
  19. data/lib/scrubyt/output/export.rb +59 -174
  20. data/lib/scrubyt/output/post_processor.rb +4 -3
  21. data/lib/scrubyt/output/result.rb +8 -9
  22. data/lib/scrubyt/output/result_dumper.rb +81 -42
  23. data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
  24. data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
  25. data/lib/scrubyt/utils/shared_utils.rb +39 -26
  26. data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
  27. data/lib/scrubyt/utils/xpathutils.rb +31 -30
  28. data/test/unittests/constraint_test.rb +11 -7
  29. data/test/unittests/extractor_test.rb +6 -6
  30. data/test/unittests/filter_test.rb +66 -66
  31. metadata +22 -15
  32. data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -11,9 +11,9 @@ class ConstraintTest < Test::Unit::TestCase
11
11
  ensure_absence_of_attribute('fill' => 'small_circles')
12
12
  end
13
13
 
14
- assert_equal(data.children[0].filters[0].constraints[0].type,
14
+ assert_equal(data.children[0].constraints[0].type,
15
15
  Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
16
- assert_equal(data.children[0].filters[0].constraints[1].type,
16
+ assert_equal(data.children[0].constraints[1].type,
17
17
  Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
18
18
  end
19
19
 
@@ -25,9 +25,9 @@ class ConstraintTest < Test::Unit::TestCase
25
25
  ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
26
26
  end
27
27
 
28
- assert_equal(data.children[0].filters[0].constraints[0].type,
28
+ assert_equal(data.children[0].constraints[0].type,
29
29
  Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
30
- assert_equal(data.children[0].filters[0].constraints[1].type,
30
+ assert_equal(data.children[0].constraints[1].type,
31
31
  Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
32
32
  end
33
33
 
@@ -50,14 +50,18 @@ class ConstraintTest < Test::Unit::TestCase
50
50
  (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
51
51
  ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
52
52
  end
53
-
53
+
54
54
  data3 = Scrubyt::Extractor.define do
55
55
  fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
56
56
 
57
- (shape 'line').ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
58
- ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
57
+ shape 'line'#.ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
58
+ # ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
59
59
  end
60
60
 
61
+ p data3.to_xml.to_s
62
+ exit
63
+
64
+
61
65
  data4 = Scrubyt::Extractor.define do
62
66
  fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
63
67
 
@@ -11,8 +11,8 @@ class ExtractorTest < Test::Unit::TestCase
11
11
 
12
12
  assert_equal(pattern.name, "root")
13
13
  assert_equal(pattern.children[0].name, 'pattern')
14
- assert_equal(pattern.type, Scrubyt::Pattern::PATTERN_TYPE_ROOT)
15
- assert_equal(pattern.output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
14
+ assert_equal(pattern.type, :root)
15
+ assert_equal(pattern.output_type, :model)
16
16
 
17
17
  assert_equal(pattern.generalize, false)
18
18
  assert_equal(pattern.children[0].generalize, true)
@@ -25,12 +25,12 @@ class ExtractorTest < Test::Unit::TestCase
25
25
  end
26
26
 
27
27
  assert_equal(pattern.name, "root")
28
- assert_equal(pattern.type, Scrubyt::Pattern::PATTERN_TYPE_ROOT)
29
- assert_equal(pattern.output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
28
+ assert_equal(pattern.type, :root)
29
+ assert_equal(pattern.output_type, :model)
30
30
 
31
31
  assert_equal(pattern.children[0].name, "parent")
32
- assert_equal(pattern.children[0].type, Scrubyt::Pattern::PATTERN_TYPE_TREE)
33
- assert_equal(pattern.children[0].output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
32
+ assert_equal(pattern.children[0].type, :tree)
33
+ assert_equal(pattern.children[0].output_type, :model)
34
34
  end
35
35
 
36
36
  def test_create_more_children
@@ -5,75 +5,75 @@ require 'test/unit'
5
5
  class FilterTest < Test::Unit::TestCase
6
6
  def test_determine_example_type
7
7
  #Test children example
8
- assert_equal(Scrubyt::Filter.determine_example_type(nil),
9
- Scrubyt::Filter::EXAMPLE_TYPE_CHILDREN)
8
+ assert_equal(Scrubyt::BaseFilter.determine_example_type(nil),
9
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_CHILDREN)
10
10
  #Test image example
11
- assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.png'),
12
- Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
13
- assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.gif'),
14
- Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
15
- assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.jpg'),
16
- Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
17
- assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.jpeg'),
18
- Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
19
- assert_not_equal(Scrubyt::Filter.determine_example_type('scrubyt.zip'),
20
- Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
21
- assert_not_equal(Scrubyt::Filter.determine_example_type('scrubyt.pif'),
22
- Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
11
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.png'),
12
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
13
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.gif'),
14
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
15
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.jpg'),
16
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
17
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.jpeg'),
18
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
19
+ assert_not_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.zip'),
20
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
21
+ assert_not_equal(Scrubyt::BaseFilter.determine_example_type('scrubyt.pif'),
22
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_IMAGE)
23
23
  #Test XPaths
24
- assert_equal(Scrubyt::Filter.determine_example_type('/p/img'),
25
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
26
- assert_equal(Scrubyt::Filter.determine_example_type('/p/h3'),
27
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
28
- assert_equal(Scrubyt::Filter.determine_example_type('/p/h3/a/h2'),
29
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
30
- assert_equal(Scrubyt::Filter.determine_example_type('/h2'),
31
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
32
- assert_equal(Scrubyt::Filter.determine_example_type('/h1/h3'),
33
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
34
- assert_equal(Scrubyt::Filter.determine_example_type('/p'),
35
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
36
- assert_equal(Scrubyt::Filter.determine_example_type('//p'),
37
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
38
- assert_equal(Scrubyt::Filter.determine_example_type('/p//img'),
39
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
40
- assert_equal(Scrubyt::Filter.determine_example_type('//p//img'),
41
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
42
- assert_equal(Scrubyt::Filter.determine_example_type('/p[0]/img'),
43
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
44
- assert_equal(Scrubyt::Filter.determine_example_type('/p[0]'),
45
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
46
- assert_equal(Scrubyt::Filter.determine_example_type('//p[1]'),
47
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
48
- assert_equal(Scrubyt::Filter.determine_example_type('/p[1]//img[2]'),
49
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
50
- assert_equal(Scrubyt::Filter.determine_example_type('//p[1]//img'),
51
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
52
- assert_equal(Scrubyt::Filter.determine_example_type('/table/tr/td//span/b'),
53
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
54
- assert_equal(Scrubyt::Filter.determine_example_type('/table[0]//tr/td[1]/span[2]/b'),
55
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
56
- assert_not_equal(Scrubyt::Filter.determine_example_type('table[0]//tr/td[1]/span[2]/b'),
57
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
58
- assert_not_equal(Scrubyt::Filter.determine_example_type('/table[a]//tr/td[1]/span[2]/b'),
59
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
60
- assert_not_equal(Scrubyt::Filter.determine_example_type('/tab2le[a]//tr/td[1]/span[2]/b'),
61
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
62
- assert_not_equal(Scrubyt::Filter.determine_example_type('/table[a]///tr/td[1]/span[2]/b'),
63
- Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
24
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p/img'),
25
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
26
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p/h3'),
27
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
28
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p/h3/a/h2'),
29
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
30
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/h2'),
31
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
32
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/h1/h3'),
33
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
34
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p'),
35
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
36
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('//p'),
37
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
38
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p//img'),
39
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
40
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('//p//img'),
41
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
42
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p[0]/img'),
43
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
44
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p[0]'),
45
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
46
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('//p[1]'),
47
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
48
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/p[1]//img[2]'),
49
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
50
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('//p[1]//img'),
51
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
52
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/table/tr/td//span/b'),
53
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
54
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('/table[0]//tr/td[1]/span[2]/b'),
55
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
56
+ assert_not_equal(Scrubyt::BaseFilter.determine_example_type('table[0]//tr/td[1]/span[2]/b'),
57
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
58
+ assert_not_equal(Scrubyt::BaseFilter.determine_example_type('/table[a]//tr/td[1]/span[2]/b'),
59
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
60
+ assert_not_equal(Scrubyt::BaseFilter.determine_example_type('/tab2le[a]//tr/td[1]/span[2]/b'),
61
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
62
+ assert_not_equal(Scrubyt::BaseFilter.determine_example_type('/table[a]///tr/td[1]/span[2]/b'),
63
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_XPATH)
64
64
  #Test string example
65
- assert_equal(Scrubyt::Filter.determine_example_type('Hello, world!'),
66
- Scrubyt::Filter::EXAMPLE_TYPE_STRING)
67
- assert_equal(Scrubyt::Filter.determine_example_type('$1022'),
68
- Scrubyt::Filter::EXAMPLE_TYPE_STRING)
69
- assert_equal(Scrubyt::Filter.determine_example_type('CANON'),
70
- Scrubyt::Filter::EXAMPLE_TYPE_STRING)
71
- assert_equal(Scrubyt::Filter.determine_example_type('This is a string'),
72
- Scrubyt::Filter::EXAMPLE_TYPE_STRING)
73
- assert_equal(Scrubyt::Filter.determine_example_type('45'),
74
- Scrubyt::Filter::EXAMPLE_TYPE_STRING)
75
- assert_equal(Scrubyt::Filter.determine_example_type('td'),
76
- Scrubyt::Filter::EXAMPLE_TYPE_STRING)
65
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('Hello, world!'),
66
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
67
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('$1022'),
68
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
69
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('CANON'),
70
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
71
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('This is a string'),
72
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
73
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('45'),
74
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
75
+ assert_equal(Scrubyt::BaseFilter.determine_example_type('td'),
76
+ Scrubyt::BaseFilter::EXAMPLE_TYPE_STRING)
77
77
 
78
78
  end
79
79
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrubyt
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.6
7
- date: 2007-03-25 00:00:00 +01:00
6
+ version: 0.2.8
7
+ date: 2007-04-12 00:00:00 +02:00
8
8
  summary: A powerful Web-scraping framework
9
9
  require_paths:
10
10
  - lib
@@ -34,36 +34,43 @@ files:
34
34
  - CHANGELOG
35
35
  - Rakefile
36
36
  - lib/scrubyt.rb
37
- - lib/scrubyt/utils/shared_utils.rb
38
- - lib/scrubyt/utils/xpathutils.rb
39
- - lib/scrubyt/utils/simple_example_lookup.rb
40
- - lib/scrubyt/utils/compound_example_lookup.rb
41
37
  - lib/scrubyt/output/result_dumper.rb
38
+ - lib/scrubyt/output/result.rb
42
39
  - lib/scrubyt/output/export.rb
43
40
  - lib/scrubyt/output/post_processor.rb
44
- - lib/scrubyt/output/result.rb
41
+ - lib/scrubyt/utils/compound_example_lookup.rb
42
+ - lib/scrubyt/utils/simple_example_lookup.rb
43
+ - lib/scrubyt/utils/ruby_extensions.rb
44
+ - lib/scrubyt/utils/xpathutils.rb
45
+ - lib/scrubyt/utils/shared_utils.rb
45
46
  - lib/scrubyt/core/navigation/navigation_actions.rb
46
47
  - lib/scrubyt/core/navigation/fetch_action.rb
47
- - lib/scrubyt/core/scraping/result_indexer.rb
48
- - lib/scrubyt/core/scraping/constraint_adder.rb
49
48
  - lib/scrubyt/core/scraping/constraint.rb
50
- - lib/scrubyt/core/scraping/filter.rb
51
49
  - lib/scrubyt/core/scraping/pattern.rb
52
50
  - lib/scrubyt/core/scraping/pre_filter_document.rb
53
51
  - lib/scrubyt/core/scraping/compound_example.rb
52
+ - lib/scrubyt/core/scraping/constraint_adder.rb
53
+ - lib/scrubyt/core/scraping/result_indexer.rb
54
+ - lib/scrubyt/core/scraping/filters/attribute_filter.rb
55
+ - lib/scrubyt/core/scraping/filters/base_filter.rb
56
+ - lib/scrubyt/core/scraping/filters/regexp_filter.rb
57
+ - lib/scrubyt/core/scraping/filters/tree_filter.rb
58
+ - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
59
+ - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
60
+ - lib/scrubyt/core/scraping/filters/download_filter.rb
54
61
  - lib/scrubyt/core/shared/u_r_i_builder.rb
55
- - lib/scrubyt/core/shared/evaluation_context.rb
56
62
  - lib/scrubyt/core/shared/extractor.rb
63
+ - lib/scrubyt/core/shared/evaluation_context.rb
57
64
  test_files:
58
65
  - test/unittests/input
66
+ - test/unittests/constraint_test.rb
59
67
  - test/unittests/filter_test.rb
60
- - test/unittests/pattern_test.rb
61
- - test/unittests/extractor_test.rb
62
68
  - test/unittests/xpathutils_test.rb
63
- - test/unittests/constraint_test.rb
69
+ - test/unittests/extractor_test.rb
70
+ - test/unittests/pattern_test.rb
64
71
  - test/unittests/simple_example_lookup_test.rb
65
- - test/unittests/input/constraint_test.html
66
72
  - test/unittests/input/test.html
73
+ - test/unittests/input/constraint_test.html
67
74
  rdoc_options: []
68
75
 
69
76
  extra_rdoc_files: []
@@ -1,201 +0,0 @@
1
- module Scrubyt
2
- ##
3
- #=<tt>Filter out relevant pieces from the parent pattern</tt>
4
- #
5
- #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
6
- #it reaches the bottom. The biggest difference is that instead of water, an HTML
7
- #document travels through the space.
8
- #
9
- #Of course Scrubyt would not make much sense if the same document would arrive at
10
- #the bottom that was poured in at the top - since in this case we might use an
11
- #identity transformation (i.e. do nothing with the input) as well.
12
- #
13
- #This is where filters come in: as their name says, they filter the stuff that is
14
- #pouring from above, to leave the interesting parts and discard the rest.
15
- #The working of a filter will be explained most easily by the help of an example.
16
- #Let's consider that we would like to extract information from a webshop; Concretely
17
- #we are interested in the name of the items and the URL pointing to the image of the
18
- #item.
19
- #
20
- #To accomplish this, first we select the items with the pattern item (a pattern is
21
- #a logical grouping of filters; see Pattern documentation). Then our new
22
- #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
23
- #extract the name and the image of the item; and finally, extract the href attribute
24
- #of the image. Let's see an illustration:
25
- #
26
- # root --> This pattern is called a 'root pattern', It is invisible to you
27
- # | and basically it represents the document; it has no filters
28
- # +-- item --> Filter what's coming from above (the whole document) to get
29
- # | relevant pieces of data (in this case webshop items)
30
- # +-- name --> Again, filter what's coming from above (a webshop item) and
31
- # | leave only item names after this operation
32
- # +-- image --> This time filter the image of the item
33
- # |
34
- # +-- href --> And finally, from the image elements, get the attribute 'href'
35
- class Filter
36
- #Type of the example this filter is extracted with
37
-
38
- #XPath example, like html/body/tr/td[1] etc.
39
- EXAMPLE_TYPE_XPATH = 0
40
- #String from the document, for example 'Canon EOS 300 D'.
41
- EXAMPLE_TYPE_STRING = 1
42
- #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
43
- EXAMPLE_TYPE_IMAGE = 2
44
- #No example - the actual XPath is determined from the children XPaths (their LCA)
45
- EXAMPLE_TYPE_CHILDREN = 3
46
- #Regexp example, like /\d+@*\d+[a-z]/
47
- EXAMPLE_TYPE_REGEXP = 4
48
- #Compound example, like :contains => 'goodies'
49
- EXAMPLE_TYPE_COMPOUND = 5
50
-
51
- attr_accessor :example_type, :parent_pattern, :temp_sink,
52
- :constraints, :xpath, :regexp, :example, :source, :sink
53
-
54
- def initialize(parent_pattern, example=nil, *args)
55
- @parent_pattern = parent_pattern
56
- #If the example type is not explicitly defined in the pattern definition,
57
- #try to determine it automatically from the example
58
- #@example_type = (args[0] == nil ? Filter.determine_example_type(example) :
59
- # args[0][:example_type])
60
- #TODOOOOO correct this!
61
- @example_type = Filter.determine_example_type(example)
62
- @sink = [] #output of a filter
63
- @source = [] #input of a filter
64
- @example = example
65
- @xpath = nil #The xpath to evaluate this filter
66
- #temp sinks are used for the initial run when determining the XPaths for examples;
67
- #@temp_sink = nil
68
- @constraints = [] #list of constraints
69
- end
70
-
71
- #Evaluate this filter. This method should not be called directly - as the pattern hierarchy
72
- #is evaluated, every pattern evaluates its filters and then they are calling this method
73
- def evaluate(source)
74
- case @parent_pattern.type
75
- when Scrubyt::Pattern::PATTERN_TYPE_TREE
76
- result = source/@xpath
77
- #puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
78
- result.class == Hpricot::Elements ? result.map : [result]
79
- when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
80
- puts "Evaluating: #{@parent_pattern.name}"
81
- attribute_value = [source.attributes[@example]]
82
- return attribute_value if attribute_value[0]
83
- @@attribute_in_parent = nil
84
- Filter.traverse_up_until_attribute_found(source.parent, @example)
85
- @@attribute_in_parent
86
- when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
87
- source.inner_text.scan(@example).flatten
88
- when Scrubyt::Pattern::PATTERN_TYPE_DETAIL
89
- #p @parent_pattern.name
90
- result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
91
- XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
92
- @parent_pattern)
93
- end
94
- end
95
-
96
- #For all the tree patterns, generate an XPath based on the given example
97
- #Also this method should not be called directly; It is automatically called for every tree
98
- #pattern directly after wrapper definition
99
- def generate_XPath_for_example(next_page_example=false)
100
- #puts "generating example for: #{@parent_pattern.name}"
101
- #puts @example_type
102
- case @example_type
103
- when EXAMPLE_TYPE_XPATH
104
- @xpath = @example
105
- when EXAMPLE_TYPE_STRING
106
- @temp_sink = SimpleExampleLookup.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
107
- @example,
108
- next_page_example )
109
- @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
110
- XPathUtils.generate_XPath(@temp_sink, nil, true)
111
- when EXAMPLE_TYPE_CHILDREN
112
- current_example_index = 0
113
- loop do
114
- all_child_temp_sinks = []
115
- @parent_pattern.children.each do |child_pattern|
116
- all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
117
- end
118
- result = all_child_temp_sinks.pop
119
- if all_child_temp_sinks.empty?
120
- result = result.parent
121
- else
122
- all_child_temp_sinks.each do |child_sink|
123
- result = XPathUtils.lowest_common_ancestor(result, child_sink)
124
- end
125
- end
126
- xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
127
- XPathUtils.generate_XPath(result, nil, true)
128
- if @parent_pattern.filters.size < current_example_index + 1
129
- @parent_pattern.filters << Scrubyt::Filter.new(@parent_pattern)
130
- end
131
- @parent_pattern.filters[current_example_index].xpath = xpath
132
- @parent_pattern.filters[current_example_index].temp_sink = result
133
- @parent_pattern.children.each do |child_pattern|
134
- next if child_pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
135
- child_pattern.filters[current_example_index].xpath =
136
- child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
137
- XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
138
- end
139
- if @parent_pattern.children[0].examples == nil
140
- break if @parent_pattern.children[0].filters.size == current_example_index+1
141
- else
142
- break if @parent_pattern.children[0].examples.size == current_example_index+1
143
- end
144
- current_example_index += 1
145
- end
146
- when EXAMPLE_TYPE_IMAGE
147
- @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
148
- @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
149
- when EXAMPLE_TYPE_COMPOUND
150
- @temp_sink = CompoundExampleLookup.find_node_from_compund_example( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
151
- @example,
152
- next_page_example )
153
- @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
154
- XPathUtils.generate_XPath(@temp_sink, nil, true)
155
- end
156
- end
157
-
158
- def setup_relative_XPaths
159
- return if !@parent_pattern.parent.parent
160
- parent_filter = @parent_pattern.parent.filters[@parent_pattern.filters.index(self)]
161
- @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_filter.xpath, @xpath) if (@xpath =~ /^\/html/)
162
- end
163
-
164
- #Dispatcher method to add constraints; of course, as with any method_missing, this method
165
- #should not be called directly
166
- def method_missing(method_name, *args, &block)
167
- constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
168
- end
169
-
170
- private
171
- def self.traverse_up_until_attribute_found(source, attribute)
172
- if (!source.parent.is_a? Hpricot::Doc)
173
- #p source.attributes
174
- #p attribute
175
- #p source.attributes[attribute]
176
- @@attribute_in_parent = source.attributes[attribute] if source.attributes[attribute]
177
- traverse_up_until_attribute_found(source.parent, attribute) if !@attribute_in_parent
178
- end
179
- end
180
-
181
- def self.determine_example_type(example)
182
- if example.instance_of? Regexp
183
- EXAMPLE_TYPE_REGEXP
184
- elsif example.instance_of? Hash
185
- EXAMPLE_TYPE_COMPOUND
186
- else
187
- case example
188
- when nil
189
- EXAMPLE_TYPE_CHILDREN
190
- when /\.(jpg|png|gif|jpeg)$/
191
- EXAMPLE_TYPE_IMAGE
192
- when
193
- /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
194
- (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
195
- else
196
- EXAMPLE_TYPE_STRING
197
- end
198
- end
199
- end #End of method determine_example_type
200
- end #End of class
201
- end #End of module