sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/CHANGELOG +350 -0
  2. data/COPYING +340 -0
  3. data/README +121 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +45 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +168 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +140 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +117 -0
@@ -0,0 +1,44 @@
1
+ ########################################## NOT USED ANY MORE ##########################################
2
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  # Results are kept in +childmap+, an array of single-entry hashes of the
  # form <tt>{source => [result, ...]}</tt>, in insertion order.
  class Result
    attr_reader :childmap, :instances

    def initialize
      @childmap = [] unless @childmap
    end

    # Record +result+ as having been produced from +source+.
    # A result already stored for that source is not stored twice.
    def add_result(source, result)
      entry = @childmap.find { |mapping| mapping.keys.first == source }
      if entry
        bucket = entry[source]
        bucket << result unless bucket.include?(result)
        return
      end
      @childmap << { source => [result] }
    end

    # Return the list of results stored for +last_result+ (used as the
    # source key), or nil when nothing was stored for it.
    def lookup(last_result)
      @childmap.each do |mapping|
        hit = mapping.find { |key, _value| key == last_result }
        return hit.last if hit
      end
      nil
    end
  end
end
30
+
31
+ #It roughly works like this:
32
+ #
33
+ # root
34
+ # source: nil
35
+ # childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
36
+
37
+ #table
38
+ # source: doc1
39
+ # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
40
+
41
+ #row
42
+ # source: table1s1, table2s1, table3s1
43
+ # childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
44
+ # {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
@@ -0,0 +1,154 @@
1
+ require 'rexml/document'
2
+ require 'rexml/xpath'
3
+
4
+ ########################################## NOT USED ANY MORE ##########################################
5
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  #
  # All methods are class methods. Note that the bare +private+ keyword
  # further down does NOT affect methods defined with <tt>def self.</tt>, so
  # the helper methods below it remain publicly callable; this is left as-is
  # so that existing callers keep working.
  class ResultDumper
    ##
    #Output the results as XML
    #
    # Builds a REXML document with a <root> element, fills it from every
    # entry of pattern.last_result, prunes empty leaves and returns the
    # document. The document is also remembered in @@last_doc, which
    # print_statistics_recursive reads.
    #
    # NOTE: pattern.last_result is overwritten while iterating.
    # NOTE: when nothing was extracted, the <root> element itself is pruned,
    # so the returned document has no root element.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      all_extracted_docs = pattern.last_result
      [all_extracted_docs].flatten.each do |lr|
        pattern.last_result = lr
        to_xml_recursive(pattern, root)
      end
      remove_empty_leaves(doc)
      @@last_doc = doc
    end

    # Recursively remove every element that has neither child elements nor text.
    def self.remove_empty_leaves(node)
      node.remove if node.elements.empty? && node.text.nil?
      node.elements.each { |child| remove_empty_leaves child }
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    #TODO: throw this away!!!
    def self.to_text(pattern)
      last_result = pattern.last_result
      result = ""
      if pattern.type == :tree
        last_result.traverse_text { |t| result += t.to_s }
      else
        result = last_result
      end
      result
    end

    # Render the extracted results as text: one line per top-level result
    # element, the non-empty texts of that element and its descendants joined
    # with ','. No CSV quoting/escaping is performed.
    #
    # Returns an empty string when nothing was extracted (previously this
    # raised NoMethodError because the pruned document has no root).
    def self.to_csv(pattern)
      root = to_xml(pattern).root
      return '' if root.nil?

      flat_csv_inner = lambda { |e, parts|
        content = e.text || ''
        parts << content if (e.is_a?(REXML::Element) && content != '')
        e.children.each { |c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      rows = []
      root.elements['/root'].each { |e| rows << flat_csv_inner.call(e, []) }
      rows.map { |a| a.join(',') }.join("\n")
    end

    # Render the extracted results as an array of hashes, one per top-level
    # result element, mapping element local names to their (unnormalized)
    # text. Repeated names within one result are joined with ','.
    #
    # Returns an empty array when nothing was extracted (previously this
    # raised NoMethodError because the pruned document has no root).
    def self.to_hash(pattern)
      root = to_xml(pattern).root
      return [] if root.nil?

      flat_hash_inner = lambda { |e, parts|
        content = e.text ? REXML::Text.unnormalize(e.text) : ''
        if (e.is_a?(REXML::Element) && content != '')
          if parts[e.local_name]
            parts[e.local_name] = parts[e.local_name] + "," + content
          else
            parts[e.local_name] = content
          end
        end
        e.children.each { |c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      result = []
      root.elements['/root'].each { |e| result << flat_hash_inner.call(e, {}) }
      result
    end

    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    #
    # Relies on @@last_doc, i.e. to_xml must have been called beforehand.
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern,0)
      puts
    end

    private
    # Walk the children of +pattern+ and append their results below +element+.
    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        childresults = child.result.lookup(child.parent.last_result)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if childresults.nil?
          ##TODO: is this needed for anything? I guess not! Drop it!!!!!!
          #Update: it seems the blackbox tests are not passing because of this (?) so temporarily adding it back
          res = ""
          if child.parent.last_result.is_a? String
            res = child.parent.last_result
          else
            child.parent.last_result.traverse_text { |t| res += t.to_s }
          end
          if (child.parent.respond_to?(:size) && child.parent.size == 0) #TODO: respond_to should not be used here, it's just a quick workaround
            element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
          end
          next
        end

        generate_children(child, childresults, element)
      end
    end

    # Create one XML element per entry of +childresults+ below +element+ and
    # recurse into the child pattern. With nil +childresults+ a single element
    # holding child.default is added instead.
    #
    # NOTE: child.last_result is overwritten for every processed result.
    def self.generate_children(child, childresults, element)
      if childresults.nil?
        child_node = REXML::Element.new(child.name)
        child_node.text = child.default
        element.add_element(child_node)
      else
        childresults.size.times do |num|
          child.last_result = childresults[num]
          res = ""
          if child.last_result.instance_of? String
            res = child.last_result
          else
            if child.last_result.respond_to? 'traverse_text'
              child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
            else
              child.last_result.children.each { |c| element.add_element c }
            end
          end
          child_node = REXML::Element.new(child.name)
          child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
          element.add_element(child_node) if (child.type != :detail_page && child_node.text != '')
          to_xml_recursive(child, child_node)
        end
      end
    end

    # Log, for every non-root pattern, how many elements named after it exist
    # in the document produced by the last to_xml call. Detail page patterns
    # are expanded into the children of their related detail pattern.
    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        if pattern.type == :detail_page
          pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
            print_statistics_recursive(child, depth)
          end
        else
          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
          Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
        end
      end

      pattern.children.each do |child|
        print_statistics_recursive(child, depth + 4)
      end
    end#end of method print_statistics_recursive
  end #end of class ResultDumper
end #end of module Scrubyt
@@ -0,0 +1,140 @@
1
module Scrubyt
  # A node of the extraction result tree. The node itself is an Array whose
  # elements are its child ResultNodes; +name+ is the tag name used when
  # rendering and +result+ is either a String or an object responding to
  # +inner_html+.
  class ResultNode < Array
    OUTPUT_OPTIONS = [:write_text]

    attr_accessor :name, :result, :options, :generated_by_leaf

    def initialize(name, result=nil, options={})
      @name = name
      @result = result
      @options = options
    end

    # Whether this node's own text should be written. An explicit
    # :write_text option wins; otherwise falls back to +generated_by_leaf+.
    def write_text
      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
    end

    # True when this node has a String result, writes its own text, or has
    # any descendant with content.
    def has_content?
      return true if result.is_a? String
      write_text || any? { |child| child.has_content? }
    end

    # Text of this node: the String result itself, or the result's
    # inner_html with tags removed; entities unescaped and surrounding
    # whitespace stripped. When a :default option is set and the text is
    # empty, the default is returned instead.
    def to_s
      text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
      # Non-destructive strip: never mutate @result in place.
      text = SharedUtils.unescape_entities(text).strip
      if (@options[:default] && ((text == '') || (text == @options[:default])))
        @options[:default]
      else
        text
      end
    end

    # Convert this node and its content-bearing children to an XML::Node
    # (XML::Node is not defined in this file — presumably libxml-ruby).
    def to_libxml
      libxml_node = XML::Node.new(name)
      self.each { |child| libxml_node << child.to_libxml if child.has_content? }
      libxml_node << to_s if write_text
      libxml_node
    end

    #note: see ruby_extensions.rb for String#write
    def to_xml
      to_xml_lines.join("\n")
    end

    # One hash per direct child: maps the names (as symbols) of that child
    # and its descendants to their texts. Values of repeated names are
    # joined with +delimiter+.
    def to_hash(delimiter=',')
      collect = lambda do |node, hash|
        if (node.write_text && !node.to_s.empty?) || node.options[:default]
          key = node.name.to_sym
          hash[key] = hash[key] ? hash[key] + delimiter + node.to_s : node.to_s
        end
        node.each { |c| collect.call(c, hash) }
        hash
      end
      map { |e| collect.call(e, {}) }
    end

    # Like to_hash, but every individual value becomes its own record.
    # Returns an empty array for a node without children (previously this
    # raised NoMethodError on nil).
    def to_flat_hash()
      split_merged_hashes(self.to_hash('@@@@@@'), '@@@@@@')
    end

    # Render the hashes of to_hash as a sequence of <item> elements.
    #
    # Without +delimiter+ every hash of to_hash becomes one <item>
    # (previously the first hash was silently dropped). With +delimiter+ the
    # values are split on it and regrouped into one <item> per value, as in
    # to_flat_hash. A value equal to '#empty#' is rendered as empty text.
    def to_flat_xml(delimiter=nil)
      hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
      hash_result = split_merged_hashes(hash_result, delimiter) if delimiter

      lines = []
      hash_result.each do |hash|
        lines << "<item>"
        hash.each do |key, value|
          xml_tag = key.to_s
          value = '' if value == '#empty#'
          lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
        end
        lines << "</item>"
      end
      lines.join("\n")
    end

    # Lines of the nested XML representation of this node (see to_xml).
    # NOTE(review): String results and :default values are written without
    # escaping, while computed text goes through ERB::Util.html_escape —
    # confirm this asymmetry is intended.
    def to_xml_lines
      lines = []
      children = self.select{ |child| child.has_content? }
      if children.empty?
        if result.is_a? String
          lines << "<#{name}>#{result}</#{name}>"
        elsif write_text && !to_s.empty?
          lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
        elsif @options[:default]
          lines << "<#{name}>#{@options[:default]}</#{name}>"
        else
          lines << "<#{name}/>"
        end
      else
        lines << "<#{name}>"
        lines << " #{ERB::Util.html_escape(to_s)}" if write_text && !to_s.empty?
        children.each do |child|
          lines.push(*child.to_xml_lines.map{ |line| " #{line}" })
        end
        lines << "</#{name}>"
      end
      lines
    end

    private

    # Merge +hashes+ key by key (keys taken from the first hash), joining
    # the values with +delimiter+, then split every merged value on the
    # delimiter again and regroup column-wise: the n-th returned hash holds
    # the n-th value of every key.
    def split_merged_hashes(hashes, delimiter)
      return [] if hashes.empty?

      merged = hashes.first.dup
      hashes.drop(1).each do |hash|
        merged.keys.each { |key| merged[key] += "#{delimiter}#{hash[key]}" }
      end

      keys = merged.keys
      columns = merged.values.map { |joined| joined.split(delimiter) }
      columns.transpose.map do |row|
        record = {}
        keys.each_with_index { |key, i| record[key] = row[i] }
        record
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module Scrubyt
  # Top-level result node of an extraction; additionally remembers the root
  # patterns and the source the extractor was defined in.
  class ScrubytResult < ResultNode
    attr_accessor :root_patterns, :source_file, :source_proc

    # Return a textual explanation followed by an ASCII rendering of the
    # pattern tree starting at root_patterns[0]. Patterns of type :tree are
    # followed by the XPath of their first filter.
    def export
      #Temporary solution; the real one will be back later - or not
      output = <<-EXPLANATION

=== Extractor tree ===

export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
For now, in case you are using examples, you can replace them by hand based on the output below.
So if your pattern in the learning extractor looks like

book "Ruby Cookbook"

and you see the following below:

[book] /table[1]/tr/td[2]

then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!

      EXPLANATION

      # Appends one entry for +node+ and then recurses into its children.
      tree_builder = lambda do |node, level|
        if level == 0
          prefix = ""
        else
          pad = " " * (level - 1)
          prefix = pad + "|\n" + pad + "+-- "
        end
        entry = prefix + "[#{node.name}]"
        entry += " #{node.filters[0].xpath}" if node.type == :tree
        output += entry + "\n"

        node.children.each { |c| tree_builder.call(c, level + 1) }
      end

      tree_builder.call(root_patterns[0], 0)

      output + "\n"
    end
  end
end
@@ -0,0 +1,50 @@
1
module Scrubyt
  #=<tt>Lookup of compound examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by compound examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class CompoundExampleLookup
    # Find the +index+-th node of +doc+ matched by every descriptor of
    # +compound_example+ (a hash, see lookup_compound_example). Returns nil
    # when there is no such node.
    #
    # +next_link+ is accepted for interface compatibility but is not used.
    # (The misspelled method name is kept because callers depend on it.)
    def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
      @partial_results = []
      self.lookup_compound_example(doc, compound_example, index)
    end

    # NOTE: `private` has no effect on methods defined with `def self.`;
    # the methods below stay publicly callable.
    private
    #Lookup the first element which is matched by this compound example
    #
    #A compound example is specified with :contains, :begins_with and
    #:ends_with descriptors - which can be both regexps or strings
    #
    #Example:
    #
    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
    #
    #The first descriptor searches the document; every further descriptor
    #only narrows the candidates found so far. Once the candidate list is
    #empty it stays empty (previously an empty intermediate list made the
    #next descriptor search the whole document again, which discarded the
    #earlier constraints).
    def self.lookup_compound_example(doc, compound_example, index)
      first_descriptor = true
      compound_example.each do |k,v|
        v = Regexp.escape(v) if v.is_a? String
        case k
        when :contains
          v = /#{v}/
        when :begins_with
          v = /^\s*#{v}/
        when :ends_with
          v = /#{v}\s*$/
        end
        if first_descriptor
          @partial_results = SharedUtils.traverse_for_match(doc, v)
          first_descriptor = false
        else
          refine_partial_results(v)
        end
      end
      @partial_results[index]
    end

    # Keep only the candidates whose tag-stripped inner_html matches +regexp+.
    def self.refine_partial_results(regexp)
      @partial_results = @partial_results.select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
    end

  end #End of class CompoundExampleLookup
end #End of module Scrubyt
@@ -0,0 +1,85 @@
1
# Class-level macros for declaring accessors backed by the instance's
# @options hash.
class Module
  # For every key => default pair define a reader that returns @options[key],
  # or the default when the stored value is nil. A Proc default is evaluated
  # in the context of the instance on every such read.
  def option_reader(key_default_hash)
    key_default_hash.each do |name, fallback|
      define_method(name) do
        stored = @options[name]
        next stored unless stored.nil?
        fallback.is_a?(Proc) ? instance_eval(&fallback) : fallback
      end
    end
  end

  # For every key define a "key=" writer storing the value in @options[key].
  def option_writer(*keys)
    keys.each do |name|
      define_method(:"#{name}=") do |value|
        @options[name] = value
      end
    end
  end

  # Define a reader for +key+ with the given default and, when +writable+
  # is true, a writer as well.
  def option(key, default=nil, writable=false)
    option_reader(key => default)
    return unless writable
    option_writer(key)
  end

  # Define a reader and a writer for every key => default pair.
  def option_accessor(key_default_hash)
    key_default_hash.each do |name, fallback|
      option(name, fallback, true)
    end
  end
end
37
+
38
# Arithmetic and ordering helpers for ranges.
class Range
  # Order ranges by their lower bound only.
  def <=>(other)
    self.begin <=> other.begin
  end

  # Shift both bounds up by +amount+. The result excludes its end exactly
  # when the receiver does (previously an exclusive range silently became
  # an inclusive one).
  def +(amount)
    Range.new(self.begin + amount, self.end + amount, exclude_end?)
  end

  # Shift both bounds down by +amount+, preserving end exclusion like #+.
  def -(amount)
    Range.new(self.begin - amount, self.end - amount, exclude_end?)
  end
end
51
+
52
# Two-argument min/max helpers added to the core Math module.
module Math
  # Return +a+ when it is strictly smaller than +b+, otherwise +b+.
  def self.min(a, b)
    return a if a < b
    b
  end

  # Return +a+ when it is strictly greater than +b+, otherwise +b+.
  def self.max(a, b)
    return a if a > b
    b
  end
end
61
+
62
+ #dec 16: Dropped - causes some errors w/ Rails
63
+ #just some hack here to allow current examples' syntax:
64
+ #table_data.to_xml.write(open('result.xml', 'w'), 1)
65
+ #class String
66
+ # def write(stringio, add_indent=0)
67
+ # stringio.write((self.split("\n").collect { |line| (' ' * add_indent) + line }).join("\n"))
68
+ # end
69
+ #end
70
+
71
+ #hack to simulate ancestor::tag selector of XPAth
72
module Hpricot
  class Elem
    # Walk from this element up towards the document.
    #
    # Without +tag+: returns an Hpricot::Elements holding this element and
    # all of its ancestors (nearest first), excluding the Hpricot::Doc.
    #
    # With +tag+: returns the first element on that path (this element
    # included) whose name equals +tag+; when there is none, returns the
    # whole path as above.
    #
    # The walk also stops at a nil parent, so elements that are not attached
    # to a document no longer raise NoMethodError on nil.
    def ancestors(tag = nil)
      element = self
      path = Hpricot::Elements.new
      until element.nil? || element.class == Hpricot::Doc
        return element if tag && tag == element.name
        path.push element
        element = element.parent
      end
      path
    end
  end
end
85
+ end