scrubyt 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,13 +134,5 @@ module Scrubyt
134
134
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
135
135
  end
136
136
 
137
- def to_sexp
138
- if @example =~ /.+\[@.+\]$/
139
- [:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
140
- else
141
- [:str, @xpath]
142
- end
143
- end
144
-
145
137
  end #End of class TreeFilter
146
138
  end #End of module Scrubyt
@@ -35,6 +35,10 @@ module Scrubyt
35
35
 
36
36
  VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
37
37
 
38
+ # :determine - default value; means the type of the example still has to be determined
39
+ # :xpath - represents a node whose example is given as an XPath (this comment originally named :string / EXAMPLE_TYPE_STRING, which is not in the list below)
40
+ VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]
41
+
38
42
  #The pattern can be either a model pattern (in this case it is
39
43
  #written to the output) or a temp pattern (in this case it is skipped)
40
44
  #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
@@ -59,7 +63,7 @@ module Scrubyt
59
63
 
60
64
  option_reader(:type => :tree, :output_type => :model, :generalize => false,
61
65
  :write_text => lambda { @children.size == 0 }, :limit => nil,
62
- :default => nil, :resolve => :full, :except => nil, :example_type => nil)
66
+ :default => nil, :resolve => :full, :except => nil, :example_type => :determine)
63
67
 
64
68
  def initialize(name, args=[], extractor=nil, parent=nil, &block)
65
69
  #init attributes
@@ -305,32 +309,6 @@ module Scrubyt
305
309
  end
306
310
  end
307
311
 
308
- def to_sexp
309
- #collect arguments
310
- args = []
311
- args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
312
- args.push(@options.to_sexp) if !@options.empty?
313
-
314
- #build main call
315
- sexp = [:fcall, @name, [:array, *args]]
316
-
317
- if type == :detail_page
318
- #add detail page extractor
319
- sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
320
- else
321
- #add child block if the pattern has children
322
- sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
323
- end
324
-
325
- #add modifier calls - TODO: remove when everything is exported to the options hash
326
- @modifier_calls.each do |modifier_sexp|
327
- sexp = [:call, sexp, *modifier_sexp]
328
- end
329
-
330
- #return complete sexp
331
- sexp
332
- end
333
-
334
312
  private
335
313
  def parse_options_hash(hash)
336
314
  #merge provided hash
@@ -339,6 +317,7 @@ module Scrubyt
339
317
  hash.each { |key, value| check_option(key.to_sym) }
340
318
  raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
341
319
  raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
320
+ raise "Invalid example type: #{example_type.to_s}" if VALID_PATTERN_EXAMPLE_TYPES.index(example_type.to_sym).nil?
342
321
  end
343
322
 
344
323
  def check_option(option)
@@ -48,10 +48,6 @@ module Scrubyt
48
48
  ary
49
49
  end
50
50
 
51
- # def to_sexp
52
- # [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
53
- # end
54
-
55
51
  private
56
52
  ##
57
53
  #Do not return the whole result set, just specified indices - like
@@ -14,6 +14,21 @@ module Scrubyt
14
14
 
15
15
  #The definition of the extractor is passed through this method
16
16
  def self.define(mode=nil, &extractor_definition)
17
+ if mode.is_a?(Hash)
18
+ if mode[:agent]==:firefox
19
+ FetchAction.class_eval do
20
+ include Navigation::Firewatir
21
+ end
22
+ else
23
+ FetchAction.class_eval do
24
+ include Navigation::Mechanize
25
+ end
26
+ end
27
+ else
28
+ FetchAction.class_eval do
29
+ include Navigation::Mechanize
30
+ end
31
+ end
17
32
  extractor = self.new(mode, extractor_definition)
18
33
  extractor.result
19
34
  end
@@ -117,7 +132,6 @@ module Scrubyt
117
132
  catch :quit_next_page_loop do
118
133
  loop do
119
134
  url = get_current_doc_url #TODO need absolute address here 2/4
120
- puts url
121
135
  @processed_pages << url
122
136
  @root_patterns.each do |root_pattern|
123
137
  root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
@@ -23,7 +23,11 @@ module Scrubyt
23
23
  text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
24
24
  text = SharedUtils.unescape_entities(text)
25
25
  text.strip!
26
- text
26
+ if (@options[:default] && ((text == '') || (text == @options[:default])))
27
+ @options[:default]
28
+ else
29
+ text
30
+ end
27
31
  end
28
32
 
29
33
  def to_libxml
@@ -41,26 +45,54 @@ module Scrubyt
41
45
  def to_hash(delimiter=',')
42
46
  result = []
43
47
  flat_hash_inner = lambda {|e, hash|
44
- hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
48
+ hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if ((e.write_text && !e.to_s.empty?) || e.options[:default])
45
49
  e.each {|c| flat_hash_inner.call(c, hash) }
46
50
  hash
47
51
  }
48
52
  self.each {|e| result << flat_hash_inner.call(e, {}) }
49
53
  result
50
54
  end
55
+
56
+ def to_flat_hash()
57
+ hash_result = self.to_hash('@@@@@@')
58
+ merged_hash = hash_result.delete_at 0
59
+ hash_result.each do |hash|
60
+ merged_hash.keys.each do |key|
61
+ merged_hash[key] += "@@@@@@#{hash[key]}"
62
+ end
63
+ end
64
+ result_sets = merged_hash.values.map!{|x| x.split('@@@@@@')}.transpose
65
+ final_result = []
66
+
67
+ result_sets.each do |rs|
68
+ temp_result = {}
69
+ merged_hash.keys.each do |k|
70
+ temp_result[k] = rs[merged_hash.keys.index(k)]
71
+ end
72
+ final_result << temp_result
73
+ end
74
+ final_result
75
+ end
51
76
 
52
77
  def to_flat_xml(delimiter=nil)
53
78
  lines = []
54
79
  hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
80
+ merged_hash = hash_result.delete_at 0
81
+
82
+ hash_result.each do |hash|
83
+ merged_hash.keys.each do |key|
84
+ merged_hash[key] += "#{delimiter}#{hash[key]}"
85
+ end
86
+ end
55
87
 
56
88
  if delimiter
57
- result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
89
+ result_sets = merged_hash.values.map!{|x| x.split(delimiter)}.transpose
58
90
  final_result = []
59
91
 
60
92
  result_sets.each do |rs|
61
93
  temp_result = {}
62
- hash_result[0].keys.each do |k|
63
- temp_result[k] = rs[hash_result[0].keys.index(k)]
94
+ merged_hash.keys.each do |k|
95
+ temp_result[k] = rs[merged_hash.keys.index(k)]
64
96
  end
65
97
  final_result << temp_result
66
98
  end
@@ -89,7 +121,11 @@ module Scrubyt
89
121
  elsif write_text && !to_s.empty?
90
122
  lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
91
123
  else
92
- lines << "<#{name}/>"
124
+ if @options[:default]
125
+ lines << "<#{name}>#{@options[:default]}</#{name}>"
126
+ else
127
+ lines << "<#{name}/>"
128
+ end
93
129
  end
94
130
  else
95
131
  lines << "<#{name}>"
@@ -2,36 +2,41 @@ module Scrubyt
2
2
  class ScrubytResult < ResultNode
3
3
  attr_accessor :root_patterns, :source_file, :source_proc
4
4
 
5
- def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
6
- if arg1.is_a? String
7
- if File.exists? arg1
8
- export_old1(arg1, output_file_name, extractor_result_file_name)
9
- else
10
- export_old2(arg1, output_file_name, extractor_result_file_name)
11
- end
12
- else
13
- export_new(arg1)
14
- end
15
- end
16
-
17
- def show_stats
18
- #Implement me...
19
- end
20
-
21
- def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
22
- contents = open(input_file).read
23
- wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
24
- export_old2(wrapper_name, output_file_name, extractor_result_file_name)
25
- end
26
-
27
- def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
28
- export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
29
- end
30
-
31
- def export_new(data)
32
- data[:source_file] = @source_file
33
- data[:source_proc] = @source_proc
34
- Scrubyt::Export.export(@root_patterns, data)
5
+ def export
6
+ #Temporary solution; the real one will be back later - or not
7
+ result = <<-EXPLANATION
8
+
9
+ === Extractor tree ===
10
+
11
+ export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
12
+ For now, in case you are using examples, you can replace them by hand based on the output below.
13
+ So if your pattern in the learning extractor looks like
14
+
15
+ book "Ruby Cookbook"
16
+
17
+ and you see the following below:
18
+
19
+ [book] /table[1]/tr/td[2]
20
+
21
+ then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!
22
+
23
+ EXPLANATION
24
+
25
+ tree_builder = lambda do |node, level|
26
+ result += current_level = (" " * (level == 0 ? 0 : level-1) +
27
+ "|\n" * (level == 0 ? 0 : 1) +
28
+ " " * (level == 0 ? 0 : level-1) +
29
+ "+-- " * (level == 0 ? 0 : 1) +
30
+ "[#{node.name}]")
31
+ result += " #{node.filters[0].xpath}" if node.type == :tree
32
+ result += "\n"
33
+
34
+ node.children.each {|c| tree_builder[c, level+1]}
35
+ end
36
+
37
+ tree_builder[root_patterns[0],0]
38
+
39
+ result += "\n"
35
40
  end
36
41
  end
37
42
  end
@@ -65,57 +65,4 @@ class String
65
65
  def write(stringio, add_indent=0)
66
66
  stringio.write((self.split("\n").collect { |line| (' ' * add_indent) + line }).join("\n"))
67
67
  end
68
- end
69
-
70
- class Array
71
- def to_sexp
72
- [:array, *to_sexp_array]
73
- end
74
-
75
- def to_sexp_array
76
- collect { |element| element.to_sexp }
77
- end
78
- end
79
-
80
- class Hash
81
- def to_sexp
82
- [:hash, *to_sexp_array]
83
- end
84
-
85
- def to_sexp_array
86
- sexp = []
87
- each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
88
- sexp
89
- end
90
- end
91
-
92
- class Symbol
93
- def to_sexp
94
- [:lit, self]
95
- end
96
- end
97
-
98
- class String
99
- def to_sexp
100
- [:str, self]
101
- end
102
- end
103
-
104
- class TrueClass
105
- def to_sexp
106
- [:true]
107
- end
108
- end
109
-
110
- class FalseClass
111
- def to_sexp
112
- [:false]
113
- end
114
- end
115
-
116
- class Proc
117
- alias_method :parse_tree_to_sexp, :to_sexp
118
- def to_sexp
119
- [:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
120
- end
121
68
  end
@@ -107,7 +107,8 @@ module Scrubyt
107
107
  #find the <form> node which is the parent of the <input> node
108
108
  def self.traverse_up_until_name(node, name)
109
109
  while node.class != Hpricot::Doc do
110
- raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
110
+ #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
111
+ return nil unless node
111
112
  break if node.name == name
112
113
  node = node.parent
113
114
  end
metadata CHANGED
@@ -1,87 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
3
- specification_version: 1
4
2
  name: scrubyt
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.3.4
7
- date: 2007-09-26 00:00:00 +02:00
8
- summary: A powerful Web-scraping framework built on Mechanize and Hpricot
9
- require_paths:
10
- - lib
11
- email: peter@rubyrailways.com
12
- homepage: http://www.scrubyt.org
13
- rubyforge_project:
14
- description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: false
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 0.4.1
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Peter Szinek
31
- files:
32
- - COPYING
33
- - README
34
- - CHANGELOG
35
- - Rakefile
36
- - lib/scrubyt.rb
37
- - lib/scrubyt/logging.rb
38
- - lib/scrubyt/output/result_dumper.rb
39
- - lib/scrubyt/output/result.rb
40
- - lib/scrubyt/output/export.rb
41
- - lib/scrubyt/output/post_processor.rb
42
- - lib/scrubyt/output/result_node.rb
43
- - lib/scrubyt/output/scrubyt_result.rb
44
- - lib/scrubyt/utils/compound_example_lookup.rb
45
- - lib/scrubyt/utils/simple_example_lookup.rb
46
- - lib/scrubyt/utils/ruby_extensions.rb
47
- - lib/scrubyt/utils/xpathutils.rb
48
- - lib/scrubyt/utils/shared_utils.rb
49
- - lib/scrubyt/core/navigation/navigation_actions.rb
50
- - lib/scrubyt/core/navigation/fetch_action.rb
51
- - lib/scrubyt/core/scraping/constraint.rb
52
- - lib/scrubyt/core/scraping/pattern.rb
53
- - lib/scrubyt/core/scraping/pre_filter_document.rb
54
- - lib/scrubyt/core/scraping/compound_example.rb
55
- - lib/scrubyt/core/scraping/constraint_adder.rb
56
- - lib/scrubyt/core/scraping/result_indexer.rb
57
- - lib/scrubyt/core/scraping/filters/attribute_filter.rb
58
- - lib/scrubyt/core/scraping/filters/base_filter.rb
59
- - lib/scrubyt/core/scraping/filters/regexp_filter.rb
60
- - lib/scrubyt/core/scraping/filters/tree_filter.rb
61
- - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
62
- - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
63
- - lib/scrubyt/core/scraping/filters/download_filter.rb
64
- - lib/scrubyt/core/scraping/filters/text_filter.rb
65
- - lib/scrubyt/core/scraping/filters/constant_filter.rb
66
- - lib/scrubyt/core/scraping/filters/script_filter.rb
67
- - lib/scrubyt/core/shared/extractor.rb
68
- test_files: []
69
-
70
- rdoc_options: []
71
-
72
- extra_rdoc_files: []
73
-
74
- executables: []
75
-
76
- extensions: []
77
-
78
- requirements: []
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
79
11
 
12
+ date: 2008-12-10 00:00:00 +01:00
13
+ default_executable:
80
14
  dependencies:
81
15
  - !ruby/object:Gem::Dependency
82
16
  name: hpricot
17
+ type: :runtime
83
18
  version_requirement:
84
- version_requirements: !ruby/object:Gem::Version::Requirement
19
+ version_requirements: !ruby/object:Gem::Requirement
85
20
  requirements:
86
21
  - - ">="
87
22
  - !ruby/object:Gem::Version
@@ -89,55 +24,85 @@ dependencies:
89
24
  version:
90
25
  - !ruby/object:Gem::Dependency
91
26
  name: mechanize
27
+ type: :runtime
92
28
  version_requirement:
93
- version_requirements: !ruby/object:Gem::Version::Requirement
29
+ version_requirements: !ruby/object:Gem::Requirement
94
30
  requirements:
95
31
  - - ">="
96
32
  - !ruby/object:Gem::Version
97
33
  version: 0.6.3
98
34
  version:
99
- - !ruby/object:Gem::Dependency
100
- name: ParseTreeReloaded
101
- version_requirement:
102
- version_requirements: !ruby/object:Gem::Version::Requirement
103
- requirements:
104
- - - ">"
105
- - !ruby/object:Gem::Version
106
- version: 0.0.0
107
- version:
108
- - !ruby/object:Gem::Dependency
109
- name: RubyInlineAcceleration
110
- version_requirement:
111
- version_requirements: !ruby/object:Gem::Version::Requirement
112
- requirements:
113
- - - ">"
114
- - !ruby/object:Gem::Version
115
- version: 0.0.0
116
- version:
117
- - !ruby/object:Gem::Dependency
118
- name: RubyInline
119
- version_requirement:
120
- version_requirements: !ruby/object:Gem::Version::Requirement
121
- requirements:
122
- - - "="
123
- - !ruby/object:Gem::Version
124
- version: 3.6.3
125
- version:
126
- - !ruby/object:Gem::Dependency
127
- name: ParseTree
128
- version_requirement:
129
- version_requirements: !ruby/object:Gem::Version::Requirement
130
- requirements:
131
- - - "="
132
- - !ruby/object:Gem::Version
133
- version: 1.7.1
134
- version:
135
- - !ruby/object:Gem::Dependency
136
- name: ruby2ruby
137
- version_requirement:
138
- version_requirements: !ruby/object:Gem::Version::Requirement
139
- requirements:
140
- - - "="
141
- - !ruby/object:Gem::Version
142
- version: 1.1.6
143
- version:
35
+ description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
36
+ email: peter@rubyrailways.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - COPYING
45
+ - README
46
+ - CHANGELOG
47
+ - Rakefile
48
+ - lib/scrubyt/core/navigation/agents/firewatir.rb
49
+ - lib/scrubyt/core/navigation/agents/mechanize.rb
50
+ - lib/scrubyt/core/navigation/fetch_action.rb
51
+ - lib/scrubyt/core/navigation/navigation_actions.rb
52
+ - lib/scrubyt/core/scraping/compound_example.rb
53
+ - lib/scrubyt/core/scraping/constraint.rb
54
+ - lib/scrubyt/core/scraping/constraint_adder.rb
55
+ - lib/scrubyt/core/scraping/filters/attribute_filter.rb
56
+ - lib/scrubyt/core/scraping/filters/base_filter.rb
57
+ - lib/scrubyt/core/scraping/filters/constant_filter.rb
58
+ - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
59
+ - lib/scrubyt/core/scraping/filters/download_filter.rb
60
+ - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
61
+ - lib/scrubyt/core/scraping/filters/regexp_filter.rb
62
+ - lib/scrubyt/core/scraping/filters/script_filter.rb
63
+ - lib/scrubyt/core/scraping/filters/text_filter.rb
64
+ - lib/scrubyt/core/scraping/filters/tree_filter.rb
65
+ - lib/scrubyt/core/scraping/pattern.rb
66
+ - lib/scrubyt/core/scraping/pre_filter_document.rb
67
+ - lib/scrubyt/core/scraping/result_indexer.rb
68
+ - lib/scrubyt/core/shared/extractor.rb
69
+ - lib/scrubyt/logging.rb
70
+ - lib/scrubyt/output/post_processor.rb
71
+ - lib/scrubyt/output/result.rb
72
+ - lib/scrubyt/output/result_dumper.rb
73
+ - lib/scrubyt/output/result_node.rb
74
+ - lib/scrubyt/output/scrubyt_result.rb
75
+ - lib/scrubyt/utils/compound_example_lookup.rb
76
+ - lib/scrubyt/utils/ruby_extensions.rb
77
+ - lib/scrubyt/utils/shared_utils.rb
78
+ - lib/scrubyt/utils/simple_example_lookup.rb
79
+ - lib/scrubyt/utils/xpathutils.rb
80
+ - lib/scrubyt.rb
81
+ has_rdoc: "true"
82
+ homepage: http://www.scrubyt.org
83
+ post_install_message:
84
+ rdoc_options: []
85
+
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: "0"
93
+ version:
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: "0"
99
+ version:
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.1
104
+ signing_key:
105
+ specification_version: 2
106
+ summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)
107
+ test_files: []
108
+