scrubyt 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,50 @@
1
+ module Scrubyt
2
+ #=<tt>Lookup of compound examples</tt>
3
+ #There are two types of string examples in scRUBYt! right now:
4
+ #the simple example and the compound example.
5
+ #
6
+ #This class is responsible for finding elements matched by compound examples.
7
+ #In the future, more sophisticated matching algorithms will probably be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
+ #text but also contains a specific attribute etc.)
10
+ class CompoundExampleLookup
11
+ def self.find_node_from_compund_example(doc, compound_example, next_link)
12
+ @partial_results = []
13
+ self.lookup_compound_example(doc, compound_example)
14
+ end
15
+
16
+ private
17
+ #Lookup the first element which is matched by this compound example
18
+ #
19
+ #A compound example is specified with :contains, :begins_with and
20
+ #:ends_with descriptors - which can be either regexps or strings
21
+ #
22
+ #Example:
23
+ #
24
+ #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
25
+ def self.lookup_compound_example(doc, compound_example)
26
+ compound_example.each do |k,v|
27
+ v = Regexp.escape(v) if v.is_a? String
28
+ case k
29
+ when :contains
30
+ v = /#{v}/
31
+ when :begins_with
32
+ v = /^\s*#{v}/
33
+ when :ends_with
34
+ v = /#{v}\s*$/
35
+ end
36
+ if (@partial_results.empty?)
37
+ @partial_results = SharedUtils.traverse_for_match(doc, v)
38
+ else
39
+ refine_partial_results(v)
40
+ end
41
+ end
42
+ @partial_results.first
43
+ end
44
+
45
+ def self.refine_partial_results(regexp)
46
+ @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
47
+ end
48
+
49
+ end #End of class CompoundExampleLookup
50
+ end #End of module Scrubyt
@@ -0,0 +1,45 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
+ #
5
+ class SharedUtils
6
+ #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
7
+ NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
8
+
9
+ #Entities to replace - need to make this more complete, or install htmlentities or similar package
10
+ ENTITIES = {
11
+ 'quot' => '"',
12
+ 'apos' => "'",
13
+ 'amp' => '&',
14
+ 'lt' => '<',
15
+ 'gt' => '>',
16
+ 'nbsp' => ' '}
17
+
18
+ #Unescape the entities in the HTML!
19
+ def self.unescape_entities(text)
20
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
21
+ text
22
+ end
23
+
24
+ #Entry point for finding the elements specified by examples
25
+ def self.traverse_for_match(node, regexp)
26
+ @results = []
27
+ traverse_for_match_inner(node,regexp)
28
+ @results
29
+ end
30
+
31
+ private
32
+ def self.traverse_for_match_inner(node, regexp)
33
+ ft = unescape_entities(node.inner_text).strip
34
+ if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
35
+ @results << node
36
+ @results.delete node.parent
37
+ end
38
+ node.children.each do |child|
39
+ if child.instance_of? Hpricot::Elem
40
+ traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
41
+ end
42
+ end
43
+ end #end of method traverse_for_match_inner
44
+ end #end of class SharedUtils
45
+ end #end of module Scrubyt
@@ -0,0 +1,23 @@
1
+ module Scrubyt
2
+ #=<tt>Lookup of simple examples</tt>
3
+ #There are two types of string examples in scRUBYt! right now:
4
+ #the simple example and the compound example.
5
+ #
6
+ #This class is responsible for finding elements matched by simple examples.
7
+ #In the future, more sophisticated matching algorithms will probably be added
8
+ #(e.g. match the n-th which matches the text, or element that matches the
9
+ #text but also contains a specific attribute etc.)
10
+ class SimpleExampleLookup
11
+ #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
12
+ #The text can be also a mixed content text, e.g.
13
+ #
14
+ # <a>Bon <b>nuit</b>, monsieur!</a>
15
+ #
16
+ #In this case, <a>'s text is considered to be "Bon nuit, monsieur!"
17
+ def self.find_node_from_text(doc, text, next_link)
18
+ text.gsub!('»', '&#187;')
19
+ text = Regexp.escape(text) if text.is_a? String
20
+ SharedUtils.traverse_for_match(doc,/#{text}/).first
21
+ end
22
+ end #End of class SimpleExampleLookup
23
+ end #End of module Scrubyt
@@ -4,54 +4,8 @@ require 'hpricot'
4
4
  module Scrubyt
5
5
  ##
6
6
  #=<tt>Various XPath utility functions</tt>
7
- class XPathUtils
8
- #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
9
- NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
10
- ENTITIES = {
11
- 'quot' => '"',
12
- 'apos' => "'",
13
- 'amp' => '&',
14
- 'lt' => '<',
15
- 'gt' => '>',
16
- 'nbsp' => ' '}
17
-
18
- #From the example text defined by the user, find the lowest possible node with the text 'text'.
19
- #The text can be also a mixed content text, e.g.
20
- #
21
- # <a>Bon <b>nuit</b>, monsieur!</a>
22
- #
23
- #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
24
- def self.find_node_from_text(doc, text, next_link)
25
- @node = nil
26
- @found = false
27
- #digg next page hack
28
- text.gsub!('»', '&#187;')
29
- self.traverse_for_full_text(doc,text)
30
- self.lowest_possible_node_with_text(@node, text) if @node != nil
31
- if (@found == false)
32
- #Fallback to per node text lookup
33
- self.traverse_for_node_text(doc,text)
34
- if (@found == false)
35
- return nil if next_link
36
- puts "!" * 65
37
- puts "!!!!!! FATAL: Node for example #{text} Not found! !!!!!!"
38
- puts "!!!!!! Please make sure you specified the example properly !!!!!!"
39
- puts "!" * 65
40
- exit
41
- end
42
- end
43
- @node
44
- end
45
-
46
- #Full text of the node; this is equivalent to Hpricot's inner_text
47
- #(? be sure to check). Will be
48
- #replaced if Hpricot 0.5 will be released
49
- def self.full_text(node)
50
- result = ""
51
- node.traverse_text { |t| result += t.to_s }
52
- result
53
- end
54
-
7
+ class XPathUtils
8
+
55
9
  #Find the LCA (Lowest Common Ancestor) of two nodes
56
10
  def self.lowest_common_ancestor(node1, node2)
57
11
  path1 = traverse_up(node1)
@@ -71,7 +25,7 @@ module Scrubyt
71
25
  #
72
26
  #*parameters*
73
27
  #
74
- #_node_ - The node we are looking the XPath for
28
+ #_node_ - The node we are looking up the XPath for
75
29
  #
76
30
  #_stopnode_ - The Xpath generation is stopped and the XPath that
77
31
  #was generated so far is returned if this node is reached.
@@ -154,7 +108,33 @@ module Scrubyt
154
108
  node
155
109
  end
156
110
 
111
+ ##
112
+ #Used when automatically looking up href attributes (for detail or next links)
113
+ #If the detail pattern did not extract a link, we first look up its
114
+ #children - and if we don't find a link, traverse up
115
+ def self.find_nearest_node_with_attribute(node, attribute)
116
+ @node = nil
117
+ return node if node.is_a? Hpricot::Elem and node[attribute]
118
+ first_child_node_with_attribute(node, attribute)
119
+ first_parent_node_with_attribute(node, attribute) if !@node
120
+ @node
121
+ end
157
122
 
123
+ ##
124
+ #Generate a relative XPath from two XPaths: a parent one (which points higher in the tree),
125
+ #and a child one. The result of the method is the relative XPath of the node pointed to
126
+ #by the second XPath to the node pointed to by the first XPath.
127
+ def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
128
+ original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
129
+ pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
130
+ i = 0
131
+ pairs.each_with_index do |pair,index|
132
+ i = index
133
+ break if pair[0] != pair[1]
134
+ end
135
+ "/" + original_child_xpath_parts[i..-1].join('/')
136
+ end
137
+
158
138
  private
159
139
  #Find the index of the child inside the parent
160
140
  #For example:
@@ -189,50 +169,21 @@ private
189
169
  end
190
170
  path
191
171
  end
192
-
193
- def self.traverse_for_node_text(node, text)
194
- return if @found
195
- if (node.instance_of? Hpricot::Elem)
196
- node.traverse_text do |t|
197
- if (t.to_s == text)
198
- @found = true
199
- @node = t.parent
200
- end
201
- end
202
- end
203
- node.children.each do |child|
204
- if child.instance_of? Hpricot::Elem
205
- traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
206
- end
207
- end
208
- end
209
-
210
- def self.traverse_for_full_text(node, text)
211
- return if @found
212
- if (node.instance_of? Hpricot::Elem)
213
- ft = unescape_entities(full_text(node)).strip
214
- #puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
215
- if (ft == text)
216
- @found = true
217
- @node = node
218
- end
219
- end
220
- node.children.each do |child|
221
- if child.instance_of? Hpricot::Elem
222
- traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
223
- end
224
- end
172
+
173
+ def self.first_child_node_with_attribute(node, attribute)
174
+ return if !node.instance_of? Hpricot::Elem || @node
175
+ @node = node if node.attributes[attribute]
176
+ node.children.each { |child| first_child_node_with_attribute(child, attribute) }
225
177
  end
226
178
 
227
- def self.unescape_entities(text)
228
- ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
229
- text
230
- end
231
-
232
- def self.lowest_possible_node_with_text(node, text)
233
- return if node.instance_of? Hpricot::Text
234
- @node = node if full_text(node) == text
235
- node.children.each { |child| lowest_possible_node_with_text(child, text) }
236
- end #End of method lowest_possible_node_with_text
179
+ def self.first_parent_node_with_attribute(node, attribute)
180
+ return if !node.instance_of? Hpricot::Elem || @node
181
+ @node = node if node.attributes[attribute]
182
+ first_parent_node_with_attribute(node.parent, attribute)
183
+ end
184
+
185
+ def self.to_general_XPath(xpath)
186
+ xpath.gsub(/\[.+?\]/) {""}
187
+ end #End of method to_general_XPath
237
188
  end #End of class XPathUtils
238
189
  end #End of module Scrubyt
@@ -0,0 +1,68 @@
1
+ require 'scrubyt'
2
+ require 'test/unit'
3
+
4
+ class SimpleExampleLookupTest
5
+
6
+ def setup
7
+ doc1 = <<-DOC
8
+ <a>
9
+ <b>
10
+ <c/>
11
+ <d>dddd</d>
12
+ <e>
13
+ <f>fff</f>
14
+ <k>kk</k>
15
+ <j/>
16
+ <l>lll</l>
17
+ <m/>
18
+ <n>nnn</n>
19
+ <n>nnnnnn</n>
20
+ <n>
21
+ nnnnnnnnn
22
+ <q/>
23
+ <r>rrr</r>
24
+ </n>
25
+ <o>ooo</o>
26
+ <n>nnnnnnnnnnnn</n>
27
+ <p>ppp</p>
28
+ </e>
29
+ </b>
30
+ <g>ggg</g>
31
+ </a>
32
+ DOC
33
+ @doc1 = Hpricot(doc1)
34
+ @a = @doc1.children[1]
35
+ @b = @a.children[1]
36
+ @c = @b.children[1]
37
+ @d = @b.children[3]
38
+ @e = @b.children[5]
39
+ @f = @e.children[1]
40
+ @g = @a.children[@a.children.size-2]
41
+ @k = @e.children[3]
42
+ @j = @e.children[5]
43
+ @l = @e.children[7]
44
+ @m = @e.children[9]
45
+ @n_1 = @e.children[11]
46
+ @n_2 = @e.children[13]
47
+ @n_3 = @e.children[15]
48
+ @o = @e.children[17]
49
+ @n_4 = @e.children[19]
50
+ @p = @e.children[21]
51
+ @q = @n_3.children[1]
52
+ @r = @n_3.children[3]
53
+ #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
54
+ end
55
+
56
+ def test_find_node_from_text
57
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
58
+ assert_instance_of(Hpricot::Elem, elem)
59
+ assert_equal(elem, @f)
60
+
61
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
62
+ assert_equal(elem, @d)
63
+
64
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
65
+ assert_equal(elem, @r)
66
+
67
+ end
68
+ end
@@ -53,20 +53,7 @@ class XPathUtilsTest < Test::Unit::TestCase
53
53
  @r = @n_3.children[3]
54
54
  #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
55
55
  end
56
-
57
- def test_find_node_from_text
58
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
59
- assert_instance_of(Hpricot::Elem, elem)
60
- assert_equal(elem, @f)
61
-
62
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
63
- assert_equal(elem, @d)
64
-
65
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
66
- assert_equal(elem, @r)
67
56
 
68
- end
69
-
70
57
  def test_lowest_common_ancestor
71
58
  lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
72
59
  lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrubyt
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.3
7
- date: 2007-02-20 00:00:00 +01:00
6
+ version: 0.2.6
7
+ date: 2007-03-25 00:00:00 +01:00
8
8
  summary: A powerful Web-scraping framework
9
9
  require_paths:
10
10
  - lib
@@ -34,19 +34,24 @@ files:
34
34
  - CHANGELOG
35
35
  - Rakefile
36
36
  - lib/scrubyt.rb
37
+ - lib/scrubyt/utils/shared_utils.rb
37
38
  - lib/scrubyt/utils/xpathutils.rb
39
+ - lib/scrubyt/utils/simple_example_lookup.rb
40
+ - lib/scrubyt/utils/compound_example_lookup.rb
38
41
  - lib/scrubyt/output/result_dumper.rb
39
42
  - lib/scrubyt/output/export.rb
40
43
  - lib/scrubyt/output/post_processor.rb
41
44
  - lib/scrubyt/output/result.rb
42
- - lib/scrubyt/core/navigation/fetch_action.rb
43
45
  - lib/scrubyt/core/navigation/navigation_actions.rb
46
+ - lib/scrubyt/core/navigation/fetch_action.rb
44
47
  - lib/scrubyt/core/scraping/result_indexer.rb
45
48
  - lib/scrubyt/core/scraping/constraint_adder.rb
46
49
  - lib/scrubyt/core/scraping/constraint.rb
47
50
  - lib/scrubyt/core/scraping/filter.rb
48
51
  - lib/scrubyt/core/scraping/pattern.rb
49
52
  - lib/scrubyt/core/scraping/pre_filter_document.rb
53
+ - lib/scrubyt/core/scraping/compound_example.rb
54
+ - lib/scrubyt/core/shared/u_r_i_builder.rb
50
55
  - lib/scrubyt/core/shared/evaluation_context.rb
51
56
  - lib/scrubyt/core/shared/extractor.rb
52
57
  test_files:
@@ -56,6 +61,7 @@ test_files:
56
61
  - test/unittests/extractor_test.rb
57
62
  - test/unittests/xpathutils_test.rb
58
63
  - test/unittests/constraint_test.rb
64
+ - test/unittests/simple_example_lookup_test.rb
59
65
  - test/unittests/input/constraint_test.html
60
66
  - test/unittests/input/test.html
61
67
  rdoc_options: []