scrubyt 0.2.6 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32)
  1. data/CHANGELOG +59 -12
  2. data/Rakefile +2 -2
  3. data/lib/scrubyt.rb +24 -6
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
  6. data/lib/scrubyt/core/scraping/constraint.rb +53 -57
  7. data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
  8. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
  9. data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
  10. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
  11. data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
  13. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
  14. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
  15. data/lib/scrubyt/core/scraping/pattern.rb +292 -157
  16. data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
  17. data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
  18. data/lib/scrubyt/core/shared/extractor.rb +122 -163
  19. data/lib/scrubyt/output/export.rb +59 -174
  20. data/lib/scrubyt/output/post_processor.rb +4 -3
  21. data/lib/scrubyt/output/result.rb +8 -9
  22. data/lib/scrubyt/output/result_dumper.rb +81 -42
  23. data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
  24. data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
  25. data/lib/scrubyt/utils/shared_utils.rb +39 -26
  26. data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
  27. data/lib/scrubyt/utils/xpathutils.rb +31 -30
  28. data/test/unittests/constraint_test.rb +11 -7
  29. data/test/unittests/extractor_test.rb +6 -6
  30. data/test/unittests/filter_test.rb +66 -66
  31. metadata +22 -15
  32. data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -4,27 +4,27 @@ module Scrubyt
4
4
  #the simple example and the compound example.
5
5
  #
6
6
  #This class is responsible for finding elements matched by compound examples.
7
- #In the futre probably more sophisticated matching algorithms will be added
8
- #(e.g. match the n-th which matches the text, or element that matches the
7
+ #In the futre probably more sophisticated matching algorithms will be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
9
  #text but also contains a specific attribute etc.)
10
10
  class CompoundExampleLookup
11
- def self.find_node_from_compund_example(doc, compound_example, next_link)
11
+ def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
12
12
  @partial_results = []
13
- self.lookup_compound_example(doc, compound_example)
13
+ self.lookup_compound_example(doc, compound_example, index)
14
14
  end
15
-
15
+
16
16
  private
17
17
  #Lookup the first element which is matched by this compound example
18
18
  #
19
- #A compound example is specified with :contains, :begins_with and
19
+ #A compound example is specified with :contains, :begins_with and
20
20
  #:ends_with descriptors - which can be both regexps or strings
21
21
  #
22
22
  #Example:
23
23
  #
24
24
  #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
25
- def self.lookup_compound_example(doc, compound_example)
25
+ def self.lookup_compound_example(doc, compound_example, index)
26
26
  compound_example.each do |k,v|
27
- v = Regexp.escape(v) if v.is_a? String
27
+ v = Regexp.escape(v) if v.is_a? String
28
28
  case k
29
29
  when :contains
30
30
  v = /#{v}/
@@ -39,12 +39,12 @@ private
39
39
  refine_partial_results(v)
40
40
  end
41
41
  end
42
- @partial_results.first
42
+ @partial_results[index]
43
43
  end
44
-
44
+
45
45
  def self.refine_partial_results(regexp)
46
46
  @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
47
47
  end
48
-
48
+
49
49
  end #End of class CompoundExampleLookup
50
50
  end #End of module Scrubyt
@@ -0,0 +1,113 @@
1
+ class Module
2
+ def option_reader(key_default_hash)
3
+ key_default_hash.each do |key, default|
4
+ define_method(key) {
5
+ if @options[key].nil?
6
+ if default.is_a? Proc
7
+ instance_eval(&default)
8
+ else
9
+ default
10
+ end
11
+ else
12
+ @options[key]
13
+ end
14
+ }
15
+ end
16
+ end
17
+
18
+ def option_writer(*keys)
19
+ keys.each do |key|
20
+ define_method("#{key.to_s}=".to_sym) { |value|
21
+ @options[key] = value
22
+ }
23
+ end
24
+ end
25
+
26
+ def option(key, default=nil, writable=false)
27
+ option_reader(key => default)
28
+ option_writer(key) if writable
29
+ end
30
+
31
+ def option_accessor(key_default_hash)
32
+ key_default_hash.each do |key, default|
33
+ option(key, default, true)
34
+ end
35
+ end
36
+ end
37
+
38
+ class Range
39
+ def <=>(other)
40
+ self.begin <=> other.begin
41
+ end
42
+
43
+ def +(amount)
44
+ (self.begin + amount)..(self.end + amount)
45
+ end
46
+
47
+ def -(amount)
48
+ (self.begin - amount)..(self.end - amount)
49
+ end
50
+ end
51
+
52
+ module Math
53
+ def self.min(a, b)
54
+ a < b ? a : b
55
+ end
56
+
57
+ def self.max(a, b)
58
+ a > b ? a : b
59
+ end
60
+ end
61
+
62
+ class Array
63
+ def to_sexp
64
+ [:array, *to_sexp_array]
65
+ end
66
+
67
+ def to_sexp_array
68
+ collect { |element| element.to_sexp }
69
+ end
70
+ end
71
+
72
+ class Hash
73
+ def to_sexp
74
+ [:hash, *to_sexp_array]
75
+ end
76
+
77
+ def to_sexp_array
78
+ sexp = []
79
+ each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
80
+ sexp
81
+ end
82
+ end
83
+
84
+ class Symbol
85
+ def to_sexp
86
+ [:lit, self]
87
+ end
88
+ end
89
+
90
+ class String
91
+ def to_sexp
92
+ [:str, self]
93
+ end
94
+ end
95
+
96
+ class TrueClass
97
+ def to_sexp
98
+ [:true]
99
+ end
100
+ end
101
+
102
+ class FalseClass
103
+ def to_sexp
104
+ [:false]
105
+ end
106
+ end
107
+
108
+ class Proc
109
+ alias_method :parse_tree_to_sexp, :to_sexp
110
+ def to_sexp
111
+ [:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
112
+ end
113
+ end
@@ -1,11 +1,8 @@
1
1
  module Scrubyt
2
2
  ##
3
- #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
- #
3
+ #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
+ #
5
5
  class SharedUtils
6
- #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
7
- NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
8
-
9
6
  #Entities to replace - need to make this more complete, or install htmlentities or similar package
10
7
  ENTITIES = {
11
8
  'quot' => '"',
@@ -14,32 +11,48 @@ module Scrubyt
14
11
  'lt' => '<',
15
12
  'gt' => '>',
16
13
  'nbsp' => ' '}
17
-
14
+
15
+ def self.prepare_text_for_comparison(text)
16
+ unescape_entities text
17
+ text.strip!
18
+ text
19
+ end
20
+
18
21
  #Unescape the entities in the HTML!
19
22
  def self.unescape_entities(text)
20
- ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
21
- text
22
- end
23
-
23
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
24
+ text
25
+ end
26
+
24
27
  #Entry point for finding the elements specified by examples
25
28
  def self.traverse_for_match(node, regexp)
26
- @results = []
27
- traverse_for_match_inner(node,regexp)
28
- @results
29
+ results = []
30
+ traverse_for_match_inner = lambda { |node, regexp|
31
+ ft = prepare_text_for_comparison(node.inner_text)
32
+ if ft =~ regexp
33
+ node.instance_eval do
34
+ @match_data = $~
35
+ def match_data
36
+ @match_data
37
+ end
38
+ end
39
+ results << node
40
+ results.delete node.parent if node.is_a? Hpricot::Elem
41
+ end
42
+ node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
43
+ }
44
+ traverse_for_match_inner.call(node,regexp)
45
+ results
29
46
  end
30
-
31
- private
32
- def self.traverse_for_match_inner(node, regexp)
33
- ft = unescape_entities(node.inner_text).strip
34
- if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
35
- @results << node
36
- @results.delete node.parent
47
+
48
+ def self.get_backtrace
49
+ begin
50
+ raise
51
+ rescue Exception => ex
52
+ backtrace = ex.backtrace
37
53
  end
38
- node.children.each do |child|
39
- if child.instance_of? Hpricot::Elem
40
- traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
41
- end
42
- end
43
- end #end of method traverse_for_match
54
+ backtrace.slice!(0)
55
+ backtrace
56
+ end
44
57
  end #end of class SharedUtils
45
58
  end #end of module Scrubyt
@@ -4,8 +4,8 @@ module Scrubyt
4
4
  #the simple example and the compound example.
5
5
  #
6
6
  #This class is responsible for finding elements matched by simple examples.
7
- #In the futre probably more sophisticated matching algorithms will be added
8
- #(e.g. match the n-th which matches the text, or element that matches the
7
+ #In the futre probably more sophisticated matching algorithms will be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
9
  #text but also contains a specific attribute etc.)
10
10
  class SimpleExampleLookup
11
11
  #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
@@ -14,10 +14,10 @@ module Scrubyt
14
14
  # <a>Bon <b>nuit</b>, monsieur!</a>
15
15
  #
16
16
  #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
17
- def self.find_node_from_text(doc, text, next_link)
17
+ def self.find_node_from_text(doc, text, next_link=false, index = 0)
18
18
  text.gsub!('»', '&#187;')
19
- text = Regexp.escape(text) if text.is_a? String
20
- SharedUtils.traverse_for_match(doc,/#{text}/).first
21
- end
19
+ text = Regexp.escape(text) if text.is_a? String
20
+ SharedUtils.traverse_for_match(doc,/#{text}/)[index]
21
+ end
22
22
  end #End of class SimpleExampleLookup
23
23
  end #End of module Scrubyt
@@ -4,9 +4,9 @@ require 'hpricot'
4
4
  module Scrubyt
5
5
  ##
6
6
  #=<tt>Various XPath utility functions</tt>
7
- class XPathUtils
8
-
9
- #Find the LCA (Lowest Common Ancestor) of two nodes
7
+ class XPathUtils
8
+
9
+ #Find the LCA (Lowest Common Ancestor) of two nodes
10
10
  def self.lowest_common_ancestor(node1, node2)
11
11
  path1 = traverse_up(node1)
12
12
  path2 = traverse_up(node2)
@@ -19,7 +19,7 @@ module Scrubyt
19
19
  end
20
20
  path1.size > path2.size ? path1.last.parent : path2.last.parent
21
21
  end
22
-
22
+
23
23
  ##
24
24
  #Generate XPath for the given node
25
25
  #
@@ -28,7 +28,7 @@ module Scrubyt
28
28
  #_node_ - The node we are looking up the XPath for
29
29
  #
30
30
  #_stopnode_ - The Xpath generation is stopped and the XPath that
31
- #was generated so far is returned if this node is reached.
31
+ #was generated so far is returned if this node is reached.
32
32
  #
33
33
  #_write_indices_ - whether the index inside the parent should be
34
34
  #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
@@ -36,7 +36,7 @@ module Scrubyt
36
36
  path = []
37
37
  indices = []
38
38
  found = false
39
- while node.class != Hpricot::Doc do
39
+ while !node.nil? && node.class != Hpricot::Doc do
40
40
  if node == stopnode
41
41
  found = true
42
42
  break
@@ -53,32 +53,32 @@ module Scrubyt
53
53
  path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
54
54
  else
55
55
  path.reverse.each{ |node| result += "#{node}/" }
56
- end
56
+ end
57
57
  "/" + result.chop
58
58
  end
59
-
60
- #Generate an XPath of the node with indices, relatively to the given
59
+
60
+ #Generate an XPath of the node with indices, relatively to the given
61
61
  #relative_root.
62
62
  #
63
- #For example if the elem's absolute XPath is /a/b/c,
63
+ #For example if the elem's absolute XPath is /a/b/c,
64
64
  #and the relative root's Xpath is a/b, the result of the function will
65
65
  #be /c.
66
66
  def self.generate_relative_XPath( elem,relative_root )
67
67
  return nil if (elem == relative_root)
68
68
  generate_XPath(elem, relative_root, true)
69
69
  end
70
-
71
- #Generate a generalized XPath (i.e. without indices) of the node,
70
+
71
+ #Generate a generalized XPath (i.e. without indices) of the node,
72
72
  #relatively to the given relative_root.
73
73
  #
74
- #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
74
+ #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
75
75
  #and the relative root's Xpath is a[1]/b[3], the result of the function will
76
- #be /c.
76
+ #be /c.
77
77
  def self.generate_generalized_relative_XPath( elem,relative_root )
78
78
  return nil if (elem == relative_root)
79
79
  generate_XPath(elem, relative_root, false)
80
80
  end
81
-
81
+
82
82
  #Find an image based on the src of the example
83
83
  #
84
84
  #*parameters*
@@ -91,7 +91,7 @@ module Scrubyt
91
91
  #and thus can be easily pasted as an example
92
92
  #
93
93
  #_index_ - there might be more images with the same src on the page -
94
- #most typically the user will need the 0th - but if this is not the
94
+ #most typically the user will need the 0th - but if this is not the
95
95
  #case, there is the possibility to override this
96
96
  def self.find_image(doc, example, index=0)
97
97
  (doc/"//img[@src='#{example}']")[index]
@@ -99,19 +99,20 @@ module Scrubyt
99
99
 
100
100
  ##
101
101
  #Used to find the parent of a node with the given name - for example
102
- #find the <form> node which is the parent of the <input> node
102
+ #find the <form> node which is the parent of the <input> node
103
103
  def self.traverse_up_until_name(node, name)
104
104
  while node.class != Hpricot::Doc do
105
+ raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
105
106
  break if node.name == name
106
107
  node = node.parent
107
108
  end
108
109
  node
109
110
  end
110
-
111
+
111
112
  ##
112
113
  #Used when automatically looking up href attributes (for detail or next links)
113
- #If the detail pattern did not extract a link, we first look up it's
114
- #children - and if we don't find a link, traverse up
114
+ #If the detail pattern did not extract a link, we first look up it's
115
+ #children - and if we don't find a link, traverse up
115
116
  def self.find_nearest_node_with_attribute(node, attribute)
116
117
  @node = nil
117
118
  return node if node.is_a? Hpricot::Elem and node[attribute]
@@ -119,13 +120,13 @@ module Scrubyt
119
120
  first_parent_node_with_attribute(node, attribute) if !@node
120
121
  @node
121
122
  end
122
-
123
+
123
124
  ##
124
125
  #Generate a relative XPath from two XPaths: a parent one (which points higher in the tree),
125
126
  #and a child one. The result of the method is the relative XPath of the node pointed to
126
127
  #by the second XPath to the node pointed to by the first XPath.
127
128
  def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
128
- original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
129
+ original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
129
130
  pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
130
131
  i = 0
131
132
  pairs.each_with_index do |pair,index|
@@ -134,7 +135,7 @@ module Scrubyt
134
135
  end
135
136
  "/" + original_child_xpath_parts[i..-1].join('/')
136
137
  end
137
-
138
+
138
139
  private
139
140
  #Find the index of the child inside the parent
140
141
  #For example:
@@ -142,7 +143,7 @@ private
142
143
  # tr
143
144
  # / | \
144
145
  # td td td
145
- # 0 1 2
146
+ # 0 1 2
146
147
  #
147
148
  #The last row contains the indices of the td's from the
148
149
  #row above.
@@ -154,7 +155,7 @@ private
154
155
  node.parent.children.each do |child|
155
156
  if child.class == Hpricot::Elem
156
157
  c += 1 if (child.name == node.name)
157
- break if (node == child)
158
+ break if (node == child)
158
159
  end
159
160
  end
160
161
  c
@@ -169,21 +170,21 @@ private
169
170
  end
170
171
  path
171
172
  end
172
-
173
+
173
174
  def self.first_child_node_with_attribute(node, attribute)
174
175
  return if !node.instance_of? Hpricot::Elem || @node
175
176
  @node = node if node.attributes[attribute]
176
177
  node.children.each { |child| first_child_node_with_attribute(child, attribute) }
177
178
  end
178
-
179
+
179
180
  def self.first_parent_node_with_attribute(node, attribute)
180
181
  return if !node.instance_of? Hpricot::Elem || @node
181
182
  @node = node if node.attributes[attribute]
182
183
  first_parent_node_with_attribute(node.parent, attribute)
183
- end
184
-
184
+ end
185
+
185
186
  def self.to_general_XPath(xpath)
186
187
  xpath.gsub(/\[.+?\]/) {""}
187
- end #End of method to_general_XPath
188
+ end #End of method to_general_XPath
188
189
  end #End of class XPathUtils
189
190
  end #End of module Scrubyt