scrubyt 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ module Scrubyt
2
+ #=<tt>Lookup of compound examples</tt>
3
+ #There are two types of string examples in scRUBYt! right now:
4
+ #the simple example and the compound example.
5
+ #
6
+ #This class is responsible for finding elements matched by compound examples.
7
+ #In the future, more sophisticated matching algorithms will probably be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
+ #text but also contains a specific attribute etc.)
10
+ class CompoundExampleLookup
11
+ def self.find_node_from_compund_example(doc, compound_example, next_link)
12
+ @partial_results = []
13
+ self.lookup_compound_example(doc, compound_example)
14
+ end
15
+
16
+ private
17
+ #Look up the first element which is matched by this compound example
18
+ #
19
+ #A compound example is specified with :contains, :begins_with and
20
+ #:ends_with descriptors - which can be either regexps or strings
21
+ #
22
+ #Example:
23
+ #
24
+ #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
25
+ def self.lookup_compound_example(doc, compound_example)
26
+ compound_example.each do |k,v|
27
+ v = Regexp.escape(v) if v.is_a? String
28
+ case k
29
+ when :contains
30
+ v = /#{v}/
31
+ when :begins_with
32
+ v = /^\s*#{v}/
33
+ when :ends_with
34
+ v = /#{v}\s*$/
35
+ end
36
+ if (@partial_results.empty?)
37
+ @partial_results = SharedUtils.traverse_for_match(doc, v)
38
+ else
39
+ refine_partial_results(v)
40
+ end
41
+ end
42
+ @partial_results.first
43
+ end
44
+
45
+ def self.refine_partial_results(regexp)
46
+ @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
47
+ end
48
+
49
+ end #End of class CompoundExampleLookup
50
+ end #End of module Scrubyt
@@ -0,0 +1,45 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
+ #
5
+ class SharedUtils
6
+ #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
7
+ NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
8
+
9
+ #Entities to replace - need to make this more complete, or install htmlentities or similar package
10
+ ENTITIES = {
11
+ 'quot' => '"',
12
+ 'apos' => "'",
13
+ 'amp' => '&',
14
+ 'lt' => '<',
15
+ 'gt' => '>',
16
+ 'nbsp' => ' '}
17
+
18
+ #Unescape the entities in the HTML!
19
+ def self.unescape_entities(text)
20
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
21
+ text
22
+ end
23
+
24
+ #Entry point for finding the elements specified by examples
25
+ def self.traverse_for_match(node, regexp)
26
+ @results = []
27
+ traverse_for_match_inner(node,regexp)
28
+ @results
29
+ end
30
+
31
+ private
32
+ def self.traverse_for_match_inner(node, regexp)
33
+ ft = unescape_entities(node.inner_text).strip
34
+ if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
35
+ @results << node
36
+ @results.delete node.parent
37
+ end
38
+ node.children.each do |child|
39
+ if child.instance_of? Hpricot::Elem
40
+ traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
41
+ end
42
+ end
43
+ end #end of method traverse_for_match
44
+ end #end of class SharedUtils
45
+ end #end of module Scrubyt
@@ -0,0 +1,23 @@
1
+ module Scrubyt
2
+ #=<tt>Lookup of simple examples</tt>
3
+ #There are two types of string examples in scRUBYt! right now:
4
+ #the simple example and the compound example.
5
+ #
6
+ #This class is responsible for finding elements matched by simple examples.
7
+ #In the future, more sophisticated matching algorithms will probably be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
+ #text but also contains a specific attribute etc.)
10
+ class SimpleExampleLookup
11
+ #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
12
+ #The text can also be mixed-content text, e.g.
13
+ #
14
+ # <a>Bon <b>nuit</b>, monsieur!</a>
15
+ #
16
+ #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
17
+ def self.find_node_from_text(doc, text, next_link)
18
+ text.gsub!('»', '&#187;')
19
+ text = Regexp.escape(text) if text.is_a? String
20
+ SharedUtils.traverse_for_match(doc,/#{text}/).first
21
+ end
22
+ end #End of class SimpleExampleLookup
23
+ end #End of module Scrubyt
@@ -4,54 +4,8 @@ require 'hpricot'
4
4
  module Scrubyt
5
5
  ##
6
6
  #=<tt>Various XPath utility functions</tt>
7
- class XPathUtils
8
- #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
9
- NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
10
- ENTITIES = {
11
- 'quot' => '"',
12
- 'apos' => "'",
13
- 'amp' => '&',
14
- 'lt' => '<',
15
- 'gt' => '>',
16
- 'nbsp' => ' '}
17
-
18
- #From the example text defined by the user, find the lowest possible node with the text 'text'.
19
- #The text can be also a mixed content text, e.g.
20
- #
21
- # <a>Bon <b>nuit</b>, monsieur!</a>
22
- #
23
- #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
24
- def self.find_node_from_text(doc, text, next_link)
25
- @node = nil
26
- @found = false
27
- #digg next page hack
28
- text.gsub!('»', '&#187;')
29
- self.traverse_for_full_text(doc,text)
30
- self.lowest_possible_node_with_text(@node, text) if @node != nil
31
- if (@found == false)
32
- #Fallback to per node text lookup
33
- self.traverse_for_node_text(doc,text)
34
- if (@found == false)
35
- return nil if next_link
36
- puts "!" * 65
37
- puts "!!!!!! FATAL: Node for example #{text} Not found! !!!!!!"
38
- puts "!!!!!! Please make sure you specified the example properly !!!!!!"
39
- puts "!" * 65
40
- exit
41
- end
42
- end
43
- @node
44
- end
45
-
46
- #Full text of the node; this is equivalent to Hpricot's inner_text
47
- #(? be sure to check). Will be
48
- #replaced if Hpricot 0.5 will be released
49
- def self.full_text(node)
50
- result = ""
51
- node.traverse_text { |t| result += t.to_s }
52
- result
53
- end
54
-
7
+ class XPathUtils
8
+
55
9
  #Find the LCA (Lowest Common Ancestor) of two nodes
56
10
  def self.lowest_common_ancestor(node1, node2)
57
11
  path1 = traverse_up(node1)
@@ -71,7 +25,7 @@ module Scrubyt
71
25
  #
72
26
  #*parameters*
73
27
  #
74
- #_node_ - The node we are looking the XPath for
28
+ #_node_ - The node we are looking up the XPath for
75
29
  #
76
30
  #_stopnode_ - The Xpath generation is stopped and the XPath that
77
31
  #was generated so far is returned if this node is reached.
@@ -154,7 +108,33 @@ module Scrubyt
154
108
  node
155
109
  end
156
110
 
111
+ ##
112
+ #Used when automatically looking up href attributes (for detail or next links)
113
+ #If the detail pattern did not extract a link, we first look up its
114
+ #children - and if we don't find a link, traverse up
115
+ def self.find_nearest_node_with_attribute(node, attribute)
116
+ @node = nil
117
+ return node if node.is_a? Hpricot::Elem and node[attribute]
118
+ first_child_node_with_attribute(node, attribute)
119
+ first_parent_node_with_attribute(node, attribute) if !@node
120
+ @node
121
+ end
157
122
 
123
+ ##
124
+ #Generate a relative XPath from two XPaths: a parent one (which points higher in the tree),
125
+ #and a child one. The result of the method is the relative XPath of the node pointed to
126
+ #by the second XPath to the node pointed to by the first XPath.
127
+ def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
128
+ original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
129
+ pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
130
+ i = 0
131
+ pairs.each_with_index do |pair,index|
132
+ i = index
133
+ break if pair[0] != pair[1]
134
+ end
135
+ "/" + original_child_xpath_parts[i..-1].join('/')
136
+ end
137
+
158
138
  private
159
139
  #Find the index of the child inside the parent
160
140
  #For example:
@@ -189,50 +169,21 @@ private
189
169
  end
190
170
  path
191
171
  end
192
-
193
- def self.traverse_for_node_text(node, text)
194
- return if @found
195
- if (node.instance_of? Hpricot::Elem)
196
- node.traverse_text do |t|
197
- if (t.to_s == text)
198
- @found = true
199
- @node = t.parent
200
- end
201
- end
202
- end
203
- node.children.each do |child|
204
- if child.instance_of? Hpricot::Elem
205
- traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
206
- end
207
- end
208
- end
209
-
210
- def self.traverse_for_full_text(node, text)
211
- return if @found
212
- if (node.instance_of? Hpricot::Elem)
213
- ft = unescape_entities(full_text(node)).strip
214
- #puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
215
- if (ft == text)
216
- @found = true
217
- @node = node
218
- end
219
- end
220
- node.children.each do |child|
221
- if child.instance_of? Hpricot::Elem
222
- traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
223
- end
224
- end
172
+
173
+ def self.first_child_node_with_attribute(node, attribute)
174
+ return if !node.instance_of? Hpricot::Elem || @node
175
+ @node = node if node.attributes[attribute]
176
+ node.children.each { |child| first_child_node_with_attribute(child, attribute) }
225
177
  end
226
178
 
227
- def self.unescape_entities(text)
228
- ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
229
- text
230
- end
231
-
232
- def self.lowest_possible_node_with_text(node, text)
233
- return if node.instance_of? Hpricot::Text
234
- @node = node if full_text(node) == text
235
- node.children.each { |child| lowest_possible_node_with_text(child, text) }
236
- end #End of method lowest_possible_node_with_text
179
+ def self.first_parent_node_with_attribute(node, attribute)
180
+ return if !node.instance_of? Hpricot::Elem || @node
181
+ @node = node if node.attributes[attribute]
182
+ first_parent_node_with_attribute(node.parent, attribute)
183
+ end
184
+
185
+ def self.to_general_XPath(xpath)
186
+ xpath.gsub(/\[.+?\]/) {""}
187
+ end #End of method to_general_XPath
237
188
  end #End of class XPathUtils
238
189
  end #End of module Scrubyt
@@ -0,0 +1,68 @@
1
+ require 'scrubyt'
2
+ require 'test/unit'
3
+
4
+ class SimpleExampleLookupTest
5
+
6
+ def setup
7
+ doc1 = <<-DOC
8
+ <a>
9
+ <b>
10
+ <c/>
11
+ <d>dddd</d>
12
+ <e>
13
+ <f>fff</f>
14
+ <k>kk</k>
15
+ <j/>
16
+ <l>lll</l>
17
+ <m/>
18
+ <n>nnn</n>
19
+ <n>nnnnnn</n>
20
+ <n>
21
+ nnnnnnnnn
22
+ <q/>
23
+ <r>rrr</r>
24
+ </n>
25
+ <o>ooo</o>
26
+ <n>nnnnnnnnnnnn</n>
27
+ <p>ppp</p>
28
+ </e>
29
+ </b>
30
+ <g>ggg</g>
31
+ </a>
32
+ DOC
33
+ @doc1 = Hpricot(doc1)
34
+ @a = @doc1.children[1]
35
+ @b = @a.children[1]
36
+ @c = @b.children[1]
37
+ @d = @b.children[3]
38
+ @e = @b.children[5]
39
+ @f = @e.children[1]
40
+ @g = @a.children[@a.children.size-2]
41
+ @k = @e.children[3]
42
+ @j = @e.children[5]
43
+ @l = @e.children[7]
44
+ @m = @e.children[9]
45
+ @n_1 = @e.children[11]
46
+ @n_2 = @e.children[13]
47
+ @n_3 = @e.children[15]
48
+ @o = @e.children[17]
49
+ @n_4 = @e.children[19]
50
+ @p = @e.children[21]
51
+ @q = @n_3.children[1]
52
+ @r = @n_3.children[3]
53
+ #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
54
+ end
55
+
56
+ def test_find_node_from_text
57
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
58
+ assert_instance_of(Hpricot::Elem, elem)
59
+ assert_equal(elem, @f)
60
+
61
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
62
+ assert_equal(elem, @d)
63
+
64
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
65
+ assert_equal(elem, @r)
66
+
67
+ end
68
+ end
@@ -53,20 +53,7 @@ class XPathUtilsTest < Test::Unit::TestCase
53
53
  @r = @n_3.children[3]
54
54
  #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
55
55
  end
56
-
57
- def test_find_node_from_text
58
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
59
- assert_instance_of(Hpricot::Elem, elem)
60
- assert_equal(elem, @f)
61
-
62
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
63
- assert_equal(elem, @d)
64
-
65
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
66
- assert_equal(elem, @r)
67
56
 
68
- end
69
-
70
57
  def test_lowest_common_ancestor
71
58
  lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
72
59
  lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrubyt
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.3
7
- date: 2007-02-20 00:00:00 +01:00
6
+ version: 0.2.6
7
+ date: 2007-03-25 00:00:00 +01:00
8
8
  summary: A powerful Web-scraping framework
9
9
  require_paths:
10
10
  - lib
@@ -34,19 +34,24 @@ files:
34
34
  - CHANGELOG
35
35
  - Rakefile
36
36
  - lib/scrubyt.rb
37
+ - lib/scrubyt/utils/shared_utils.rb
37
38
  - lib/scrubyt/utils/xpathutils.rb
39
+ - lib/scrubyt/utils/simple_example_lookup.rb
40
+ - lib/scrubyt/utils/compound_example_lookup.rb
38
41
  - lib/scrubyt/output/result_dumper.rb
39
42
  - lib/scrubyt/output/export.rb
40
43
  - lib/scrubyt/output/post_processor.rb
41
44
  - lib/scrubyt/output/result.rb
42
- - lib/scrubyt/core/navigation/fetch_action.rb
43
45
  - lib/scrubyt/core/navigation/navigation_actions.rb
46
+ - lib/scrubyt/core/navigation/fetch_action.rb
44
47
  - lib/scrubyt/core/scraping/result_indexer.rb
45
48
  - lib/scrubyt/core/scraping/constraint_adder.rb
46
49
  - lib/scrubyt/core/scraping/constraint.rb
47
50
  - lib/scrubyt/core/scraping/filter.rb
48
51
  - lib/scrubyt/core/scraping/pattern.rb
49
52
  - lib/scrubyt/core/scraping/pre_filter_document.rb
53
+ - lib/scrubyt/core/scraping/compound_example.rb
54
+ - lib/scrubyt/core/shared/u_r_i_builder.rb
50
55
  - lib/scrubyt/core/shared/evaluation_context.rb
51
56
  - lib/scrubyt/core/shared/extractor.rb
52
57
  test_files:
@@ -56,6 +61,7 @@ test_files:
56
61
  - test/unittests/extractor_test.rb
57
62
  - test/unittests/xpathutils_test.rb
58
63
  - test/unittests/constraint_test.rb
64
+ - test/unittests/simple_example_lookup_test.rb
59
65
  - test/unittests/input/constraint_test.html
60
66
  - test/unittests/input/test.html
61
67
  rdoc_options: []