scrubyt 0.2.8 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. data/CHANGELOG +32 -2
  2. data/Rakefile +25 -20
  3. data/lib/scrubyt.rb +24 -5
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +76 -42
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +24 -6
  6. data/lib/scrubyt/core/scraping/filters/base_filter.rb +5 -5
  7. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +2 -2
  8. data/lib/scrubyt/core/scraping/filters/download_filter.rb +2 -1
  9. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -2
  10. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +37 -12
  11. data/lib/scrubyt/core/scraping/pattern.rb +82 -90
  12. data/lib/scrubyt/core/scraping/pre_filter_document.rb +2 -1
  13. data/lib/scrubyt/core/shared/evaluation_context.rb +14 -37
  14. data/lib/scrubyt/core/shared/extractor.rb +55 -54
  15. data/lib/scrubyt/logging.rb +16 -0
  16. data/lib/scrubyt/output/export.rb +1 -1
  17. data/lib/scrubyt/output/post_processor.rb +6 -5
  18. data/lib/scrubyt/output/result.rb +1 -0
  19. data/lib/scrubyt/output/result_dumper.rb +4 -3
  20. data/lib/scrubyt/output/result_node.rb +73 -0
  21. data/lib/scrubyt/output/scrubyt_result.rb +28 -0
  22. data/lib/scrubyt/utils/ruby_extensions.rb +8 -0
  23. data/lib/scrubyt/utils/simple_example_lookup.rb +14 -1
  24. data/lib/scrubyt/utils/xpathutils.rb +11 -0
  25. metadata +7 -12
  26. data/test/unittests/constraint_test.rb +0 -107
  27. data/test/unittests/extractor_test.rb +0 -91
  28. data/test/unittests/filter_test.rb +0 -79
  29. data/test/unittests/input/constraint_test.html +0 -55
  30. data/test/unittests/input/test.html +0 -39
  31. data/test/unittests/pattern_test.rb +0 -27
  32. data/test/unittests/simple_example_lookup_test.rb +0 -68
  33. data/test/unittests/xpathutils_test.rb +0 -152
@@ -2,12 +2,13 @@ module Scrubyt
2
2
  ##
3
3
  #=<tt>Apply different functions on the input document</tt>
4
4
  #Before the document is passed to Hpricot for parsing, we may need
5
- #to do different stuff with it which are clumsy/not appropriate/impossible
5
+ #to do different stuff with it which are clumsy/not appropriate/impossible
6
6
  #to do once the document is loaded.
7
7
  class PreFilterDocument
8
8
  #Replace <br/> tags with newlines
9
9
  def self.br_to_newline(doc)
10
10
  doc.gsub(/<br[ \/]*>/i, "\r\n")
11
+ doc = doc.tr("\240"," ")
11
12
  end #end of function br_to_newline
12
13
  end #end of class PreFilterDocument
13
14
  end #end of module Scrubyt
@@ -5,7 +5,7 @@ module Scrubyt
5
5
  #Every kind of data that is shared among patterns during the extraction process
6
6
  #is held in this class, so it can be looked up anytime.
7
7
  #
8
- #This class provides also some high-level basic functionality in navigation, like
8
+ #This class provides also some high-level basic functionality in navigation, like
9
9
  #crawling to new pages, attaching document to the root pattern once arrived at the
10
10
  #desired page etc.
11
11
  #
@@ -14,7 +14,7 @@ module Scrubyt
14
14
  #and this is accomplished through EvaluationContext.
15
15
  class EvaluationContext
16
16
  attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
17
-
17
+
18
18
  def initialize
19
19
  @root_pattern = nil
20
20
  @next_page = nil
@@ -22,54 +22,31 @@ module Scrubyt
22
22
  @extractor = nil
23
23
  @evaluating_extractor_definition = false
24
24
  end
25
-
25
+
26
26
  ##
27
27
  #Crawl to a new page. This function should not be called from the outside - it is automatically called
28
28
  #if the next_page pattern is defined
29
- def crawl_to_new_page(root_pattern, uri_builder)
30
- temp_document = uri_builder.next_page_example ?
31
- generate_next_page_link(uri_builder) :
29
+ def crawl_to_new_page(uri_builder)
30
+ #puts "Crawling to new page!"
31
+ #puts "example #{uri_builder.next_page_example}"
32
+ temp_document = uri_builder.next_page_example ?
33
+ generate_next_page_link(uri_builder) :
32
34
  uri_builder.generate_next_uri
33
- return nil if temp_document == nil
34
- clear_sources_and_sinks(@root_pattern)
35
+ return false if temp_document == nil
35
36
  FetchAction.restore_host_name
36
37
  @extractor.fetch(temp_document)
37
- attach_current_document
38
+ return true
38
39
  end
39
40
 
40
- ##
41
- #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
42
- #crawling to a new page
43
- def attach_current_document
44
- doc = @extractor.get_hpricot_doc
45
- @root_pattern.filters[0].source << doc
46
- @root_pattern.filters[0].sink << doc
47
- @root_pattern.last_result ||= []
48
- @root_pattern.last_result << doc
49
- @root_pattern.result.add_result(@root_pattern.filters[0].source,
50
- @root_pattern.filters[0].sink)
51
- end
52
-
53
- ##
54
- #After crawling to the new page, the sources and sinks need to be cleaned
55
- #since they are no more valid
56
- def clear_sources_and_sinks(pattern)
57
- pattern.filters.each do |filter|
58
- filter.source = []
59
- filter.sink = []
60
- end
61
- pattern.children.each {|child| clear_sources_and_sinks child}
62
- end
63
-
64
41
  def generate_next_page_link(uri_builder)
65
- uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
42
+ return nil unless uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
66
43
  xpath = uri_builder.next_page_pattern.filters[0].xpath
67
44
  node = (@extractor.get_hpricot_doc/xpath).map.last
68
45
  node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
69
- return nil if node == nil || node.attributes['href'] == nil
46
+ return nil if node == nil || node.attributes['href'] == nil
70
47
  node.attributes['href'].gsub('&amp;') {'&'}
71
- end
72
-
48
+ end
49
+
73
50
  def setup_uri_builder(pattern,args)
74
51
  if args[0] =~ /^http.+/
75
52
  args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
@@ -3,49 +3,56 @@ module Scrubyt
3
3
  #=<tt>Driving the whole extraction process</tt>
4
4
  #
5
5
  #Extractor is a performer class - it gets an extractor definition and carries
6
- #out the actions and evaluates the wrappers sequentially.
6
+ #out the actions and evaluates the wrappers sequentially.
7
7
  #
8
8
  #Originally also the navigation actions were here, but since the class got too
9
9
  #big, they were factored out to an own class, NavigationAction.
10
- class Extractor
10
+ class Extractor
11
11
  #The definition of the extractor is passed through this method
12
12
  def self.define(mode=nil, &extractor_definition)
13
13
  backtrace = SharedUtils.get_backtrace
14
14
  parts = backtrace[1].split(':')
15
15
  source_file = parts[0]
16
-
16
+
17
17
  @@mode = mode
18
18
  #We are keeping the relations between the detail patterns and their root patterns
19
19
  @@detail_extractor_to_pattern_name = {}
20
- @@detail_pattern_relations = {}
20
+ @@detail_pattern_relations = {}
21
21
  #root pattern -> URIBuilder mapping
22
22
  @@next_patterns = {}
23
23
  mode_name = (mode == :production ? 'Production' : 'Learning')
24
- puts "[MODE] #{mode_name}"
25
- NavigationActions.new
24
+
25
+ Scrubyt.log :MODE, mode_name
26
+
26
27
  @@evaluation_context = EvaluationContext.new
27
- #Hack up an artificial root pattern (i.e. do not return the pattern which
28
+ #Hack up an artificial root pattern (i.e. do not return the pattern which
28
29
  #is the root one in the user's definition, but rather the real (invisible)
29
30
  #root pattern
30
31
  @@evaluation_context.evaluating_extractor_definition = true
31
32
  class_eval(&extractor_definition)
32
33
  @@evaluation_context.evaluating_extractor_definition = false
33
34
  root_pattern = @@evaluation_context.root_pattern
35
+
34
36
  if root_pattern.nil?
35
- puts "No extractor defined, exiting..."
37
+ # TODO: this should be an exception
38
+ Scrubyt.log :ERROR, 'No extractor defined, exiting...'
36
39
  exit
37
40
  end
41
+
38
42
  root_pattern.source_file = source_file
39
43
  root_pattern.source_proc = extractor_definition
40
44
  #Once all is set up, evaluate the extractor from the root pattern!
41
- evaluate_extractor(root_pattern)
42
- #Apply all postprocess steps
43
- PostProcessor.apply_post_processing(root_pattern)
45
+ root_results = evaluate_extractor(root_pattern)
46
+
47
+ scrubyt_result = ScrubytResult.new('root')
48
+ scrubyt_result.push(*root_results)
49
+ scrubyt_result.root_pattern = root_pattern
50
+
44
51
  #Return the root pattern
45
- puts "Extraction finished succesfully!"
46
- root_pattern
52
+ Scrubyt.log :INFO, 'Extraction finished succesfully!'
53
+ scrubyt_result
47
54
  end
48
-
55
+
49
56
  #Evaluate a subextractor (i.e. an extractor on a detail page).
50
57
  #The url passed to this function is automatically loaded.
51
58
  #The definition of the subextractor is passed as a block
@@ -53,119 +60,113 @@ module Scrubyt
53
60
  #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
54
61
  def self.evaluate_subextractor(url, parent_pattern, resolve)
55
62
  if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
56
- detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
57
- detail_root.result = Result.new
63
+ detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
58
64
  detail_root.last_result = nil
59
65
  FetchAction.store_page
60
66
  @@original_evaluation_context.push @@evaluation_context
61
67
  @@host_stack.push FetchAction.get_host_name
62
68
  @@evaluation_context = EvaluationContext.new
63
- @@evaluation_context.clear_sources_and_sinks detail_root
64
69
  FetchAction.restore_host_name
65
70
  fetch url, :resolve => resolve
66
71
  @@evaluation_context.extractor = self
67
- @@evaluation_context.root_pattern = detail_root
68
- @@evaluation_context.attach_current_document
69
- evaluate_extractor detail_root
72
+ @@evaluation_context.root_pattern = detail_root
73
+ root_results = evaluate_extractor detail_root
70
74
  @@evaluation_context = @@original_evaluation_context.pop
71
75
  FetchAction.restore_page
72
76
  FetchAction.store_host_name(@@host_stack.pop)
73
- detail_root.to_xml
74
- else
77
+ root_results
78
+ else
75
79
  @@original_evaluation_context ||= []
76
80
  @@host_stack ||= []
77
81
  FetchAction.store_page
78
82
  @@original_evaluation_context.push @@evaluation_context
79
83
  @@host_stack.push FetchAction.get_host_name
80
84
  @@evaluation_context = EvaluationContext.new
81
- FetchAction.restore_host_name
85
+ FetchAction.restore_host_name
82
86
  fetch url, :resolve => resolve
83
- evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
84
- root_pattern = evaluated_extractor.parent
85
- @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
86
- evaluate_extractor(root_pattern)
87
- #Apply all postprocess steps
88
- PostProcessor.apply_post_processing(root_pattern)
87
+ class_eval(&parent_pattern.referenced_extractor)
88
+ root_pattern = @@evaluation_context.root_pattern
89
+ @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
90
+ root_results = evaluate_extractor(root_pattern)
89
91
  @@evaluation_context = @@original_evaluation_context.pop
90
92
  FetchAction.restore_page
91
93
  FetchAction.store_host_name(@@host_stack.pop)
92
- root_pattern.to_xml
94
+ root_results
93
95
  end
94
96
  end
95
-
96
- #build the current wrapper
97
+
98
+ #build the current wrapper
97
99
  def self.method_missing(method_name, *args, &block)
98
100
  if NavigationActions::KEYWORDS.include? method_name.to_s
99
101
  NavigationActions.send(method_name, *args)
100
102
  return
101
103
  end
104
+
102
105
  if method_name.to_s == 'next_page'
103
106
  pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
104
107
  pattern.evaluation_context = @@evaluation_context
105
-
108
+
106
109
  @@evaluation_context.setup_uri_builder(pattern, args)
107
110
  @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
108
111
  else
109
112
  raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
110
113
  #Create a root pattern
111
- root_pattern = Scrubyt::Pattern.new('root', [:type => :root], @@evaluation_context)
114
+ @@evaluation_context.extractor = self
115
+ root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
112
116
  @@last_root_pattern = root_pattern
113
117
  @@evaluation_context.root_pattern = root_pattern
114
- @@evaluation_context.extractor = self
115
- #add the currently active document to the root pattern
116
- @@evaluation_context.attach_current_document
117
- pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
118
- root_pattern.children << pattern
119
- pattern
118
+ root_pattern
120
119
  end
121
120
  end
122
-
121
+
123
122
  def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
124
123
  @@detail_extractor_to_pattern_name[referenced_extractor] ||= [] << pattern
125
124
  end
126
125
 
127
126
  def self.get_detail_extractor(parent_pattern)
128
- @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
127
+ @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
129
128
  end
130
129
 
131
130
  def self.get_hpricot_doc
132
131
  NavigationActions.get_hpricot_doc
133
132
  end
134
-
133
+
135
134
  def self.get_current_doc_url
136
135
  NavigationActions.get_current_doc_url
137
136
  end
138
-
137
+
139
138
  def self.get_detail_pattern_relations
140
139
  @@detail_pattern_relations
141
140
  end
142
-
141
+
143
142
  def self.get_host_name
144
143
  NavigationActions.get_host_name
145
144
  end
146
-
145
+
147
146
  def self.get_mode
148
147
  @@mode
149
148
  end
150
-
149
+
151
150
  def self.get_original_host_name
152
151
  @@original_host_name
153
152
  end
154
-
153
+
155
154
  private
156
-
155
+
157
156
  def self.evaluate_extractor(root_pattern)
157
+ root_results = []
158
158
  if @@next_patterns[root_pattern]
159
159
  current_page_count = 1
160
160
  loop do
161
- root_pattern.evaluate(nil)
162
- break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]))
161
+ root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
162
+ break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(@@next_patterns[root_pattern]))
163
163
  current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
164
164
  end
165
165
  else
166
- root_pattern.evaluate(nil)
166
+ root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
167
167
  end
168
+ root_results
168
169
  end
169
-
170
+
170
171
  end #end of class Extractor
171
- end #end of module Scrubyt
172
+ end #end of module Scrubyt
@@ -0,0 +1,16 @@
1
module Scrubyt
  ##
  #Write a log message to standard output, tagged with its message type.
  #
  #message_type:: label printed inside square brackets (e.g. :INFO, :ERROR)
  #message::      either a single object (printed through to_s) or an Array
  #               of lines; for an Array the first line follows the tag and
  #               every further line is indented so it aligns beneath it
  #
  #An empty Array prints the bare tag instead of raising, and Array elements
  #that are not Strings are converted with to_s.
  def self.log(message_type, message)
    prefix = "[#{message_type}] "

    #Treat a scalar message as a one-line list so both cases share one path
    first, *rest = message.is_a?(Array) ? message : [message]

    #Interpolation (rather than String#+) tolerates nil and non-String lines
    puts "#{prefix}#{first}"

    indent = ' ' * prefix.length
    rest.each { |line| puts "#{indent}#{line}" }
  end
end
@@ -135,7 +135,7 @@ private
135
135
  end
136
136
 
137
137
  def self.export_pattern(root_pattern)
138
- root_pattern.children[0].to_sexp
138
+ root_pattern.to_sexp
139
139
  end
140
140
  end
141
141
  end
@@ -1,5 +1,6 @@
1
1
  module Scrubyt
2
2
 
3
+ ########################################## NOT USED ANY MORE ##########################################
3
4
  require 'set'
4
5
  ##
5
6
  #=<tt>Post processing results after the extraction</tt>
@@ -46,11 +47,11 @@ require 'set'
46
47
  def self.report_if_no_results(root_pattern)
47
48
  results_found = false
48
49
  root_pattern.children.each {|child| return if (child.result.childmap.size > 0)}
49
- puts
50
- puts "!!!!!! WARNING: The extractor did not find any result instances"
51
- puts "Most probably this is wrong. Check your extractor and if you are"
52
- puts "sure it should work, report a bug!"
53
- puts
50
+
51
+ Scrubyt.log :WARNING, [
52
+ "The extractor did not find any result instances. Most probably this is wrong.",
53
+ "Check your extractor and if you are sure it should work, report a bug!"
54
+ ]
54
55
  end
55
56
 
56
57
  private
@@ -1,3 +1,4 @@
1
+ ########################################## NOT USED ANY MORE ##########################################
1
2
  module Scrubyt
2
3
  ##
3
4
  #=<tt>Represents the results of a pattern</tt>
@@ -1,6 +1,7 @@
1
1
  require 'rexml/document'
2
2
  require 'rexml/xpath'
3
3
 
4
+ ########################################## NOT USED ANY MORE ##########################################
4
5
  module Scrubyt
5
6
  ##
6
7
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
@@ -45,7 +46,7 @@ module Scrubyt
45
46
  flat_csv_inner = lambda {|e, parts|
46
47
  content = e.text || ''
47
48
  parts << content if ((e.is_a? REXML::Element) && content != '')
48
- e.children.each {|c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
49
+ e.children.each {|c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
49
50
  parts
50
51
  }
51
52
  to_xml(pattern).root.elements['/root'].each {|e| result << flat_csv_inner.call(e, []) }
@@ -55,7 +56,7 @@ module Scrubyt
55
56
  def self.to_hash(pattern)
56
57
  result = []
57
58
  flat_hash_inner = lambda {|e, parts|
58
- content = e.text || ''
59
+ content = e.text ? REXML::Text.unnormalize(e.text) : ''
59
60
  if ((e.is_a? REXML::Element) && content != '')
60
61
  if parts[e.local_name]
61
62
  parts[e.local_name] = parts[e.local_name] + "," + content
@@ -141,7 +142,7 @@ private
141
142
  end
142
143
  else
143
144
  count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
144
- puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
145
+ Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
145
146
  end
146
147
  end
147
148
 
@@ -0,0 +1,73 @@
1
module Scrubyt
  ##
  #=<tt>A node of the extraction result tree</tt>
  #
  #A ResultNode is an Array whose elements are its child ResultNodes. Besides
  #the children it carries a name (used as the tag name / hash key on output),
  #the raw result (a String, or an object responding to inner_text) and a
  #hash of output options.
  class ResultNode < Array
    OUTPUT_OPTIONS = [:write_text]

    attr_accessor :name, :result, :options, :generated_by_leaf

    #name::    name of the node, used as tag name and hash key
    #result::  a String or an object responding to inner_text; may be nil
    #options:: output options, see OUTPUT_OPTIONS
    def initialize(name, result=nil, options={})
      @name = name
      @result = result
      @options = options
    end

    #Whether the text of this node should be written to the output.
    #An explicit :write_text option wins; otherwise the generated_by_leaf
    #flag decides.
    def write_text
      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
    end

    #True if this node holds a String result, writes its own text, or has at
    #least one descendant with content.
    def has_content?
      return true if result.is_a? String
      write_text || any? { |child| child.has_content? }
    end

    #Text of the node: the result itself (String) or its inner_text, with
    #entities unescaped and surrounding whitespace removed.
    #Returns an empty string when no result was set, and never modifies the
    #stored result in place.
    def to_s
      return '' if @result.nil?
      text = (@result.is_a? String) ? @result : @result.inner_text
      SharedUtils.unescape_entities(text).strip
    end

    #Build an XML::Node for this node; children without content are left out
    #and the node text is appended when write_text is set.
    #NOTE(review): XML::Node is presumably provided by libxml-ruby - confirm.
    def to_libxml
      libxml_node = XML::Node.new(name)
      self.each { |child| libxml_node << child.to_libxml if child.has_content? }
      libxml_node << to_s if write_text
      libxml_node
    end

    #note: see ruby_extensions.rb for String#write
    #Serialize the subtree rooted at this node as an XML string.
    def to_xml
      to_xml_lines.join("\n")
    end

    #Return one Hash per direct child. Each hash maps node names (as symbols)
    #to the text of every node in that child's subtree that writes non-empty
    #text; texts of nodes sharing a name are joined with ",".
    def to_hash
      collect = lambda do |node, hash|
        if node.write_text
          text = node.to_s
          unless text.empty?
            key = node.name.to_sym
            hash[key] = hash[key] ? "#{hash[key]},#{text}" : text
          end
        end
        node.each { |child| collect.call(child, hash) }
        hash
      end
      map { |node| collect.call(node, {}) }
    end

    #Return the XML serialization of this subtree as an Array of lines.
    #A String result on a childless node is emitted as is (not escaped);
    #text obtained through to_s is HTML-escaped.
    def to_xml_lines
      lines = []
      children = self.select { |child| child.has_content? }
      #Compute the text once; it is only needed when this node writes text
      text = write_text ? to_s : ''
      if children.empty?
        if result.is_a? String
          lines << "<#{name}>#{result}</#{name}>"
        elsif !text.empty?
          lines << "<#{name}>#{ERB::Util.html_escape(text)}</#{name}>"
        else
          lines << "<#{name}/>"
        end
      else
        lines << "<#{name}>"
        lines << " #{ERB::Util.html_escape(text)}" unless text.empty?
        children.each do |child|
          lines.push(*child.to_xml_lines.map { |line| " #{line}" })
        end
        lines << "</#{name}>"
      end
      lines
    end
  end
end
+ end