scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. data/CHANGELOG +59 -12
  2. data/Rakefile +2 -2
  3. data/lib/scrubyt.rb +24 -6
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
  6. data/lib/scrubyt/core/scraping/constraint.rb +53 -57
  7. data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
  8. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
  9. data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
  10. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
  11. data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
  13. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
  14. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
  15. data/lib/scrubyt/core/scraping/pattern.rb +292 -157
  16. data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
  17. data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
  18. data/lib/scrubyt/core/shared/extractor.rb +122 -163
  19. data/lib/scrubyt/output/export.rb +59 -174
  20. data/lib/scrubyt/output/post_processor.rb +4 -3
  21. data/lib/scrubyt/output/result.rb +8 -9
  22. data/lib/scrubyt/output/result_dumper.rb +81 -42
  23. data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
  24. data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
  25. data/lib/scrubyt/utils/shared_utils.rb +39 -26
  26. data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
  27. data/lib/scrubyt/utils/xpathutils.rb +31 -30
  28. data/test/unittests/constraint_test.rb +11 -7
  29. data/test/unittests/extractor_test.rb +6 -6
  30. data/test/unittests/filter_test.rb +66 -66
  31. metadata +22 -15
  32. data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -4,27 +4,27 @@ module Scrubyt
4
4
  #the simple example and the compound example.
5
5
  #
6
6
  #This class is responsible for finding elements matched by compound examples.
7
- #In the future probably more sophisticated matching algorithms will be added
8
- #(e.g. match the n-th which matches the text, or element that matches the
7
+ #In the future probably more sophisticated matching algorithms will be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
9
  #text but also contains a specific attribute etc.)
10
10
  class CompoundExampleLookup
11
- def self.find_node_from_compund_example(doc, compound_example, next_link)
11
+ def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
12
12
  @partial_results = []
13
- self.lookup_compound_example(doc, compound_example)
13
+ self.lookup_compound_example(doc, compound_example, index)
14
14
  end
15
-
15
+
16
16
  private
17
17
  #Lookup the first element which is matched by this compound example
18
18
  #
19
- #A compound example is specified with :contains, :begins_with and
19
+ #A compound example is specified with :contains, :begins_with and
20
20
  #:ends_with descriptors - which can be both regexps or strings
21
21
  #
22
22
  #Example:
23
23
  #
24
24
  #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
25
- def self.lookup_compound_example(doc, compound_example)
25
+ def self.lookup_compound_example(doc, compound_example, index)
26
26
  compound_example.each do |k,v|
27
- v = Regexp.escape(v) if v.is_a? String
27
+ v = Regexp.escape(v) if v.is_a? String
28
28
  case k
29
29
  when :contains
30
30
  v = /#{v}/
@@ -39,12 +39,12 @@ private
39
39
  refine_partial_results(v)
40
40
  end
41
41
  end
42
- @partial_results.first
42
+ @partial_results[index]
43
43
  end
44
-
44
+
45
45
  def self.refine_partial_results(regexp)
46
46
  @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
47
47
  end
48
-
48
+
49
49
  end #End of class CompoundExampleLookup
50
50
  end #End of module Scrubyt
@@ -0,0 +1,113 @@
1
+ class Module
2
+ def option_reader(key_default_hash)
3
+ key_default_hash.each do |key, default|
4
+ define_method(key) {
5
+ if @options[key].nil?
6
+ if default.is_a? Proc
7
+ instance_eval(&default)
8
+ else
9
+ default
10
+ end
11
+ else
12
+ @options[key]
13
+ end
14
+ }
15
+ end
16
+ end
17
+
18
+ def option_writer(*keys)
19
+ keys.each do |key|
20
+ define_method("#{key.to_s}=".to_sym) { |value|
21
+ @options[key] = value
22
+ }
23
+ end
24
+ end
25
+
26
+ def option(key, default=nil, writable=false)
27
+ option_reader(key => default)
28
+ option_writer(key) if writable
29
+ end
30
+
31
+ def option_accessor(key_default_hash)
32
+ key_default_hash.each do |key, default|
33
+ option(key, default, true)
34
+ end
35
+ end
36
+ end
37
+
38
+ class Range
39
+ def <=>(other)
40
+ self.begin <=> other.begin
41
+ end
42
+
43
+ def +(amount)
44
+ (self.begin + amount)..(self.end + amount)
45
+ end
46
+
47
+ def -(amount)
48
+ (self.begin - amount)..(self.end - amount)
49
+ end
50
+ end
51
+
52
+ module Math
53
+ def self.min(a, b)
54
+ a < b ? a : b
55
+ end
56
+
57
+ def self.max(a, b)
58
+ a > b ? a : b
59
+ end
60
+ end
61
+
62
+ class Array
63
+ def to_sexp
64
+ [:array, *to_sexp_array]
65
+ end
66
+
67
+ def to_sexp_array
68
+ collect { |element| element.to_sexp }
69
+ end
70
+ end
71
+
72
+ class Hash
73
+ def to_sexp
74
+ [:hash, *to_sexp_array]
75
+ end
76
+
77
+ def to_sexp_array
78
+ sexp = []
79
+ each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
80
+ sexp
81
+ end
82
+ end
83
+
84
+ class Symbol
85
+ def to_sexp
86
+ [:lit, self]
87
+ end
88
+ end
89
+
90
+ class String
91
+ def to_sexp
92
+ [:str, self]
93
+ end
94
+ end
95
+
96
+ class TrueClass
97
+ def to_sexp
98
+ [:true]
99
+ end
100
+ end
101
+
102
+ class FalseClass
103
+ def to_sexp
104
+ [:false]
105
+ end
106
+ end
107
+
108
+ class Proc
109
+ alias_method :parse_tree_to_sexp, :to_sexp
110
+ def to_sexp
111
+ [:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
112
+ end
113
+ end
@@ -1,11 +1,8 @@
1
1
  module Scrubyt
2
2
  ##
3
- #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
- #
3
+ #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
+ #
5
5
  class SharedUtils
6
- #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
7
- NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
8
-
9
6
  #Entities to replace - need to make this more complete, or install htmlentities or similar package
10
7
  ENTITIES = {
11
8
  'quot' => '"',
@@ -14,32 +11,48 @@ module Scrubyt
14
11
  'lt' => '<',
15
12
  'gt' => '>',
16
13
  'nbsp' => ' '}
17
-
14
+
15
+ def self.prepare_text_for_comparison(text)
16
+ unescape_entities text
17
+ text.strip!
18
+ text
19
+ end
20
+
18
21
  #Unescape the entities in the HTML!
19
22
  def self.unescape_entities(text)
20
- ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
21
- text
22
- end
23
-
23
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
24
+ text
25
+ end
26
+
24
27
  #Entry point for finding the elements specified by examples
25
28
  def self.traverse_for_match(node, regexp)
26
- @results = []
27
- traverse_for_match_inner(node,regexp)
28
- @results
29
+ results = []
30
+ traverse_for_match_inner = lambda { |node, regexp|
31
+ ft = prepare_text_for_comparison(node.inner_text)
32
+ if ft =~ regexp
33
+ node.instance_eval do
34
+ @match_data = $~
35
+ def match_data
36
+ @match_data
37
+ end
38
+ end
39
+ results << node
40
+ results.delete node.parent if node.is_a? Hpricot::Elem
41
+ end
42
+ node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
43
+ }
44
+ traverse_for_match_inner.call(node,regexp)
45
+ results
29
46
  end
30
-
31
- private
32
- def self.traverse_for_match_inner(node, regexp)
33
- ft = unescape_entities(node.inner_text).strip
34
- if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
35
- @results << node
36
- @results.delete node.parent
47
+
48
+ def self.get_backtrace
49
+ begin
50
+ raise
51
+ rescue Exception => ex
52
+ backtrace = ex.backtrace
37
53
  end
38
- node.children.each do |child|
39
- if child.instance_of? Hpricot::Elem
40
- traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
41
- end
42
- end
43
- end #end of method traverse_for_match
54
+ backtrace.slice!(0)
55
+ backtrace
56
+ end
44
57
  end #end of class SharedUtils
45
58
  end #end of module Scrubyt
@@ -4,8 +4,8 @@ module Scrubyt
4
4
  #the simple example and the compound example.
5
5
  #
6
6
  #This class is responsible for finding elements matched by simple examples.
7
- #In the future probably more sophisticated matching algorithms will be added
8
- #(e.g. match the n-th which matches the text, or element that matches the
7
+ #In the future probably more sophisticated matching algorithms will be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
9
  #text but also contains a specific attribute etc.)
10
10
  class SimpleExampleLookup
11
11
  #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
@@ -14,10 +14,10 @@ module Scrubyt
14
14
  # <a>Bon <b>nuit</b>, monsieur!</a>
15
15
  #
16
16
  #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
17
- def self.find_node_from_text(doc, text, next_link)
17
+ def self.find_node_from_text(doc, text, next_link=false, index = 0)
18
18
  text.gsub!('»', '&#187;')
19
- text = Regexp.escape(text) if text.is_a? String
20
- SharedUtils.traverse_for_match(doc,/#{text}/).first
21
- end
19
+ text = Regexp.escape(text) if text.is_a? String
20
+ SharedUtils.traverse_for_match(doc,/#{text}/)[index]
21
+ end
22
22
  end #End of class SimpleExampleLookup
23
23
  end #End of module Scrubyt
@@ -4,9 +4,9 @@ require 'hpricot'
4
4
  module Scrubyt
5
5
  ##
6
6
  #=<tt>Various XPath utility functions</tt>
7
- class XPathUtils
8
-
9
- #Find the LCA (Lowest Common Ancestor) of two nodes
7
+ class XPathUtils
8
+
9
+ #Find the LCA (Lowest Common Ancestor) of two nodes
10
10
  def self.lowest_common_ancestor(node1, node2)
11
11
  path1 = traverse_up(node1)
12
12
  path2 = traverse_up(node2)
@@ -19,7 +19,7 @@ module Scrubyt
19
19
  end
20
20
  path1.size > path2.size ? path1.last.parent : path2.last.parent
21
21
  end
22
-
22
+
23
23
  ##
24
24
  #Generate XPath for the given node
25
25
  #
@@ -28,7 +28,7 @@ module Scrubyt
28
28
  #_node_ - The node we are looking up the XPath for
29
29
  #
30
30
  #_stopnode_ - The Xpath generation is stopped and the XPath that
31
- #was generated so far is returned if this node is reached.
31
+ #was generated so far is returned if this node is reached.
32
32
  #
33
33
  #_write_indices_ - whether the index inside the parent should be
34
34
  #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
@@ -36,7 +36,7 @@ module Scrubyt
36
36
  path = []
37
37
  indices = []
38
38
  found = false
39
- while node.class != Hpricot::Doc do
39
+ while !node.nil? && node.class != Hpricot::Doc do
40
40
  if node == stopnode
41
41
  found = true
42
42
  break
@@ -53,32 +53,32 @@ module Scrubyt
53
53
  path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
54
54
  else
55
55
  path.reverse.each{ |node| result += "#{node}/" }
56
- end
56
+ end
57
57
  "/" + result.chop
58
58
  end
59
-
60
- #Generate an XPath of the node with indices, relatively to the given
59
+
60
+ #Generate an XPath of the node with indices, relatively to the given
61
61
  #relative_root.
62
62
  #
63
- #For example if the elem's absolute XPath is /a/b/c,
63
+ #For example if the elem's absolute XPath is /a/b/c,
64
64
  #and the relative root's Xpath is a/b, the result of the function will
65
65
  #be /c.
66
66
  def self.generate_relative_XPath( elem,relative_root )
67
67
  return nil if (elem == relative_root)
68
68
  generate_XPath(elem, relative_root, true)
69
69
  end
70
-
71
- #Generate a generalized XPath (i.e. without indices) of the node,
70
+
71
+ #Generate a generalized XPath (i.e. without indices) of the node,
72
72
  #relatively to the given relative_root.
73
73
  #
74
- #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
74
+ #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
75
75
  #and the relative root's Xpath is a[1]/b[3], the result of the function will
76
- #be /c.
76
+ #be /c.
77
77
  def self.generate_generalized_relative_XPath( elem,relative_root )
78
78
  return nil if (elem == relative_root)
79
79
  generate_XPath(elem, relative_root, false)
80
80
  end
81
-
81
+
82
82
  #Find an image based on the src of the example
83
83
  #
84
84
  #*parameters*
@@ -91,7 +91,7 @@ module Scrubyt
91
91
  #and thus can be easily pasted as an example
92
92
  #
93
93
  #_index_ - there might be more images with the same src on the page -
94
- #most typically the user will need the 0th - but if this is not the
94
+ #most typically the user will need the 0th - but if this is not the
95
95
  #case, there is the possibility to override this
96
96
  def self.find_image(doc, example, index=0)
97
97
  (doc/"//img[@src='#{example}']")[index]
@@ -99,19 +99,20 @@ module Scrubyt
99
99
 
100
100
  ##
101
101
  #Used to find the parent of a node with the given name - for example
102
- #find the <form> node which is the parent of the <input> node
102
+ #find the <form> node which is the parent of the <input> node
103
103
  def self.traverse_up_until_name(node, name)
104
104
  while node.class != Hpricot::Doc do
105
+ raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
105
106
  break if node.name == name
106
107
  node = node.parent
107
108
  end
108
109
  node
109
110
  end
110
-
111
+
111
112
  ##
112
113
  #Used when automatically looking up href attributes (for detail or next links)
113
- #If the detail pattern did not extract a link, we first look up its
114
- #children - and if we don't find a link, traverse up
114
+ #If the detail pattern did not extract a link, we first look up its
115
+ #children - and if we don't find a link, traverse up
115
116
  def self.find_nearest_node_with_attribute(node, attribute)
116
117
  @node = nil
117
118
  return node if node.is_a? Hpricot::Elem and node[attribute]
@@ -119,13 +120,13 @@ module Scrubyt
119
120
  first_parent_node_with_attribute(node, attribute) if !@node
120
121
  @node
121
122
  end
122
-
123
+
123
124
  ##
124
125
  #Generate relative XPath from two XPaths: a parent one, (which points higher in the tree),
125
126
  #and a child one. The result of the method is the relative XPath of the node pointed to
126
127
  #by the second XPath to the node pointed to by the first XPath.
127
128
  def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
128
- original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
129
+ original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
129
130
  pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
130
131
  i = 0
131
132
  pairs.each_with_index do |pair,index|
@@ -134,7 +135,7 @@ module Scrubyt
134
135
  end
135
136
  "/" + original_child_xpath_parts[i..-1].join('/')
136
137
  end
137
-
138
+
138
139
  private
139
140
  #Find the index of the child inside the parent
140
141
  #For example:
@@ -142,7 +143,7 @@ private
142
143
  # tr
143
144
  # / | \
144
145
  # td td td
145
- # 0 1 2
146
+ # 0 1 2
146
147
  #
147
148
  #The last row contains the indices of the td's from the
148
149
  #row above.
@@ -154,7 +155,7 @@ private
154
155
  node.parent.children.each do |child|
155
156
  if child.class == Hpricot::Elem
156
157
  c += 1 if (child.name == node.name)
157
- break if (node == child)
158
+ break if (node == child)
158
159
  end
159
160
  end
160
161
  c
@@ -169,21 +170,21 @@ private
169
170
  end
170
171
  path
171
172
  end
172
-
173
+
173
174
  def self.first_child_node_with_attribute(node, attribute)
174
175
  return if !node.instance_of? Hpricot::Elem || @node
175
176
  @node = node if node.attributes[attribute]
176
177
  node.children.each { |child| first_child_node_with_attribute(child, attribute) }
177
178
  end
178
-
179
+
179
180
  def self.first_parent_node_with_attribute(node, attribute)
180
181
  return if !node.instance_of? Hpricot::Elem || @node
181
182
  @node = node if node.attributes[attribute]
182
183
  first_parent_node_with_attribute(node.parent, attribute)
183
- end
184
-
184
+ end
185
+
185
186
  def self.to_general_XPath(xpath)
186
187
  xpath.gsub(/\[.+?\]/) {""}
187
- end #End of method to_general_XPath
188
+ end #End of method to_general_XPath
188
189
  end #End of class XPathUtils
189
190
  end #End of module Scrubyt