scrubyt 0.3.4 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -134,13 +134,5 @@ module Scrubyt
134
134
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
135
135
  end
136
136
 
137
- def to_sexp
138
- if @example =~ /.+\[@.+\]$/
139
- [:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
140
- else
141
- [:str, @xpath]
142
- end
143
- end
144
-
145
137
  end #End of class TreeFilter
146
138
  end #End of module Scrubyt
@@ -35,6 +35,10 @@ module Scrubyt
35
35
 
36
36
  VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
37
37
 
38
+ # :determine - default value; indicates that the example's type still needs to be determined
39
+ # :xpath - represents a node whose example is given directly as an XPath
40
+ VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]
41
+
38
42
  #The pattern can be either a model pattern (in this case it is
39
43
  #written to the output) or a temp pattern (in this case it is skipped)
40
44
  #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
@@ -59,7 +63,7 @@ module Scrubyt
59
63
 
60
64
  option_reader(:type => :tree, :output_type => :model, :generalize => false,
61
65
  :write_text => lambda { @children.size == 0 }, :limit => nil,
62
- :default => nil, :resolve => :full, :except => nil, :example_type => nil)
66
+ :default => nil, :resolve => :full, :except => nil, :example_type => :determine)
63
67
 
64
68
  def initialize(name, args=[], extractor=nil, parent=nil, &block)
65
69
  #init attributes
@@ -305,32 +309,6 @@ module Scrubyt
305
309
  end
306
310
  end
307
311
 
308
- def to_sexp
309
- #collect arguments
310
- args = []
311
- args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
312
- args.push(@options.to_sexp) if !@options.empty?
313
-
314
- #build main call
315
- sexp = [:fcall, @name, [:array, *args]]
316
-
317
- if type == :detail_page
318
- #add detail page extractor
319
- sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
320
- else
321
- #add child block if the pattern has children
322
- sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
323
- end
324
-
325
- #add modifier calls - TODO: remove when everything is exported to the options hash
326
- @modifier_calls.each do |modifier_sexp|
327
- sexp = [:call, sexp, *modifier_sexp]
328
- end
329
-
330
- #return complete sexp
331
- sexp
332
- end
333
-
334
312
  private
335
313
  def parse_options_hash(hash)
336
314
  #merge provided hash
@@ -339,6 +317,7 @@ module Scrubyt
339
317
  hash.each { |key, value| check_option(key.to_sym) }
340
318
  raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
341
319
  raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
320
+ raise "Invalid example type: #{example_type.to_s}" if VALID_PATTERN_EXAMPLE_TYPES.index(example_type.to_sym).nil?
342
321
  end
343
322
 
344
323
  def check_option(option)
@@ -48,10 +48,6 @@ module Scrubyt
48
48
  ary
49
49
  end
50
50
 
51
- # def to_sexp
52
- # [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
53
- # end
54
-
55
51
  private
56
52
  ##
57
53
  #Do not return the whole result set, just specified indices - like
@@ -14,6 +14,21 @@ module Scrubyt
14
14
 
15
15
  #The definition of the extractor is passed through this method
16
16
  def self.define(mode=nil, &extractor_definition)
17
+ if mode.is_a?(Hash)
18
+ if mode[:agent]==:firefox
19
+ FetchAction.class_eval do
20
+ include Navigation::Firewatir
21
+ end
22
+ else
23
+ FetchAction.class_eval do
24
+ include Navigation::Mechanize
25
+ end
26
+ end
27
+ else
28
+ FetchAction.class_eval do
29
+ include Navigation::Mechanize
30
+ end
31
+ end
17
32
  extractor = self.new(mode, extractor_definition)
18
33
  extractor.result
19
34
  end
@@ -117,7 +132,6 @@ module Scrubyt
117
132
  catch :quit_next_page_loop do
118
133
  loop do
119
134
  url = get_current_doc_url #TODO need absolute address here 2/4
120
- puts url
121
135
  @processed_pages << url
122
136
  @root_patterns.each do |root_pattern|
123
137
  root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
@@ -23,7 +23,11 @@ module Scrubyt
23
23
  text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
24
24
  text = SharedUtils.unescape_entities(text)
25
25
  text.strip!
26
- text
26
+ if (@options[:default] && ((text == '') || (text == @options[:default])))
27
+ @options[:default]
28
+ else
29
+ text
30
+ end
27
31
  end
28
32
 
29
33
  def to_libxml
@@ -41,26 +45,54 @@ module Scrubyt
41
45
  def to_hash(delimiter=',')
42
46
  result = []
43
47
  flat_hash_inner = lambda {|e, hash|
44
- hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
48
+ hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if ((e.write_text && !e.to_s.empty?) || e.options[:default])
45
49
  e.each {|c| flat_hash_inner.call(c, hash) }
46
50
  hash
47
51
  }
48
52
  self.each {|e| result << flat_hash_inner.call(e, {}) }
49
53
  result
50
54
  end
55
+
56
+ def to_flat_hash()
57
+ hash_result = self.to_hash('@@@@@@')
58
+ merged_hash = hash_result.delete_at 0
59
+ hash_result.each do |hash|
60
+ merged_hash.keys.each do |key|
61
+ merged_hash[key] += "@@@@@@#{hash[key]}"
62
+ end
63
+ end
64
+ result_sets = merged_hash.values.map!{|x| x.split('@@@@@@')}.transpose
65
+ final_result = []
66
+
67
+ result_sets.each do |rs|
68
+ temp_result = {}
69
+ merged_hash.keys.each do |k|
70
+ temp_result[k] = rs[merged_hash.keys.index(k)]
71
+ end
72
+ final_result << temp_result
73
+ end
74
+ final_result
75
+ end
51
76
 
52
77
  def to_flat_xml(delimiter=nil)
53
78
  lines = []
54
79
  hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
80
+ merged_hash = hash_result.delete_at 0
81
+
82
+ hash_result.each do |hash|
83
+ merged_hash.keys.each do |key|
84
+ merged_hash[key] += "#{delimiter}#{hash[key]}"
85
+ end
86
+ end
55
87
 
56
88
  if delimiter
57
- result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
89
+ result_sets = merged_hash.values.map!{|x| x.split(delimiter)}.transpose
58
90
  final_result = []
59
91
 
60
92
  result_sets.each do |rs|
61
93
  temp_result = {}
62
- hash_result[0].keys.each do |k|
63
- temp_result[k] = rs[hash_result[0].keys.index(k)]
94
+ merged_hash.keys.each do |k|
95
+ temp_result[k] = rs[merged_hash.keys.index(k)]
64
96
  end
65
97
  final_result << temp_result
66
98
  end
@@ -89,7 +121,11 @@ module Scrubyt
89
121
  elsif write_text && !to_s.empty?
90
122
  lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
91
123
  else
92
- lines << "<#{name}/>"
124
+ if @options[:default]
125
+ lines << "<#{name}>#{@options[:default]}</#{name}>"
126
+ else
127
+ lines << "<#{name}/>"
128
+ end
93
129
  end
94
130
  else
95
131
  lines << "<#{name}>"
@@ -2,36 +2,41 @@ module Scrubyt
2
2
  class ScrubytResult < ResultNode
3
3
  attr_accessor :root_patterns, :source_file, :source_proc
4
4
 
5
- def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
6
- if arg1.is_a? String
7
- if File.exists? arg1
8
- export_old1(arg1, output_file_name, extractor_result_file_name)
9
- else
10
- export_old2(arg1, output_file_name, extractor_result_file_name)
11
- end
12
- else
13
- export_new(arg1)
14
- end
15
- end
16
-
17
- def show_stats
18
- #Implement me...
19
- end
20
-
21
- def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
22
- contents = open(input_file).read
23
- wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
24
- export_old2(wrapper_name, output_file_name, extractor_result_file_name)
25
- end
26
-
27
- def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
28
- export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
29
- end
30
-
31
- def export_new(data)
32
- data[:source_file] = @source_file
33
- data[:source_proc] = @source_proc
34
- Scrubyt::Export.export(@root_patterns, data)
5
+ def export
6
+ #Temporary solution; the real one will be back later - or not
7
+ result = <<-EXPLANATION
8
+
9
+ === Extractor tree ===
10
+
11
+ export() is not working at the moment, due to the removal of ParseTree, ruby2ruby and RubyInline.
12
+ For now, in case you are using examples, you can replace them by hand based on the output below.
13
+ So if your pattern in the learning extractor looks like
14
+
15
+ book "Ruby Cookbook"
16
+
17
+ and you see the following below:
18
+
19
+ [book] /table[1]/tr/td[2]
20
+
21
+ then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!
22
+
23
+ EXPLANATION
24
+
25
+ tree_builder = lambda do |node, level|
26
+ result += current_level = (" " * (level == 0 ? 0 : level-1) +
27
+ "|\n" * (level == 0 ? 0 : 1) +
28
+ " " * (level == 0 ? 0 : level-1) +
29
+ "+-- " * (level == 0 ? 0 : 1) +
30
+ "[#{node.name}]")
31
+ result += " #{node.filters[0].xpath}" if node.type == :tree
32
+ result += "\n"
33
+
34
+ node.children.each {|c| tree_builder[c, level+1]}
35
+ end
36
+
37
+ tree_builder[root_patterns[0],0]
38
+
39
+ result += "\n"
35
40
  end
36
41
  end
37
42
  end
@@ -65,57 +65,4 @@ class String
65
65
  def write(stringio, add_indent=0)
66
66
  stringio.write((self.split("\n").collect { |line| (' ' * add_indent) + line }).join("\n"))
67
67
  end
68
- end
69
-
70
- class Array
71
- def to_sexp
72
- [:array, *to_sexp_array]
73
- end
74
-
75
- def to_sexp_array
76
- collect { |element| element.to_sexp }
77
- end
78
- end
79
-
80
- class Hash
81
- def to_sexp
82
- [:hash, *to_sexp_array]
83
- end
84
-
85
- def to_sexp_array
86
- sexp = []
87
- each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
88
- sexp
89
- end
90
- end
91
-
92
- class Symbol
93
- def to_sexp
94
- [:lit, self]
95
- end
96
- end
97
-
98
- class String
99
- def to_sexp
100
- [:str, self]
101
- end
102
- end
103
-
104
- class TrueClass
105
- def to_sexp
106
- [:true]
107
- end
108
- end
109
-
110
- class FalseClass
111
- def to_sexp
112
- [:false]
113
- end
114
- end
115
-
116
- class Proc
117
- alias_method :parse_tree_to_sexp, :to_sexp
118
- def to_sexp
119
- [:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
120
- end
121
68
  end
@@ -107,7 +107,8 @@ module Scrubyt
107
107
  #find the <form> node which is the parent of the <input> node
108
108
  def self.traverse_up_until_name(node, name)
109
109
  while node.class != Hpricot::Doc do
110
- raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
110
+ #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
111
+ return nil unless node
111
112
  break if node.name == name
112
113
  node = node.parent
113
114
  end
metadata CHANGED
@@ -1,87 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
3
- specification_version: 1
4
2
  name: scrubyt
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.3.4
7
- date: 2007-09-26 00:00:00 +02:00
8
- summary: A powerful Web-scraping framework built on Mechanize and Hpricot
9
- require_paths:
10
- - lib
11
- email: peter@rubyrailways.com
12
- homepage: http://www.scrubyt.org
13
- rubyforge_project:
14
- description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: false
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 0.4.1
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Peter Szinek
31
- files:
32
- - COPYING
33
- - README
34
- - CHANGELOG
35
- - Rakefile
36
- - lib/scrubyt.rb
37
- - lib/scrubyt/logging.rb
38
- - lib/scrubyt/output/result_dumper.rb
39
- - lib/scrubyt/output/result.rb
40
- - lib/scrubyt/output/export.rb
41
- - lib/scrubyt/output/post_processor.rb
42
- - lib/scrubyt/output/result_node.rb
43
- - lib/scrubyt/output/scrubyt_result.rb
44
- - lib/scrubyt/utils/compound_example_lookup.rb
45
- - lib/scrubyt/utils/simple_example_lookup.rb
46
- - lib/scrubyt/utils/ruby_extensions.rb
47
- - lib/scrubyt/utils/xpathutils.rb
48
- - lib/scrubyt/utils/shared_utils.rb
49
- - lib/scrubyt/core/navigation/navigation_actions.rb
50
- - lib/scrubyt/core/navigation/fetch_action.rb
51
- - lib/scrubyt/core/scraping/constraint.rb
52
- - lib/scrubyt/core/scraping/pattern.rb
53
- - lib/scrubyt/core/scraping/pre_filter_document.rb
54
- - lib/scrubyt/core/scraping/compound_example.rb
55
- - lib/scrubyt/core/scraping/constraint_adder.rb
56
- - lib/scrubyt/core/scraping/result_indexer.rb
57
- - lib/scrubyt/core/scraping/filters/attribute_filter.rb
58
- - lib/scrubyt/core/scraping/filters/base_filter.rb
59
- - lib/scrubyt/core/scraping/filters/regexp_filter.rb
60
- - lib/scrubyt/core/scraping/filters/tree_filter.rb
61
- - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
62
- - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
63
- - lib/scrubyt/core/scraping/filters/download_filter.rb
64
- - lib/scrubyt/core/scraping/filters/text_filter.rb
65
- - lib/scrubyt/core/scraping/filters/constant_filter.rb
66
- - lib/scrubyt/core/scraping/filters/script_filter.rb
67
- - lib/scrubyt/core/shared/extractor.rb
68
- test_files: []
69
-
70
- rdoc_options: []
71
-
72
- extra_rdoc_files: []
73
-
74
- executables: []
75
-
76
- extensions: []
77
-
78
- requirements: []
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
79
11
 
12
+ date: 2008-12-10 00:00:00 +01:00
13
+ default_executable:
80
14
  dependencies:
81
15
  - !ruby/object:Gem::Dependency
82
16
  name: hpricot
17
+ type: :runtime
83
18
  version_requirement:
84
- version_requirements: !ruby/object:Gem::Version::Requirement
19
+ version_requirements: !ruby/object:Gem::Requirement
85
20
  requirements:
86
21
  - - ">="
87
22
  - !ruby/object:Gem::Version
@@ -89,55 +24,85 @@ dependencies:
89
24
  version:
90
25
  - !ruby/object:Gem::Dependency
91
26
  name: mechanize
27
+ type: :runtime
92
28
  version_requirement:
93
- version_requirements: !ruby/object:Gem::Version::Requirement
29
+ version_requirements: !ruby/object:Gem::Requirement
94
30
  requirements:
95
31
  - - ">="
96
32
  - !ruby/object:Gem::Version
97
33
  version: 0.6.3
98
34
  version:
99
- - !ruby/object:Gem::Dependency
100
- name: ParseTreeReloaded
101
- version_requirement:
102
- version_requirements: !ruby/object:Gem::Version::Requirement
103
- requirements:
104
- - - ">"
105
- - !ruby/object:Gem::Version
106
- version: 0.0.0
107
- version:
108
- - !ruby/object:Gem::Dependency
109
- name: RubyInlineAcceleration
110
- version_requirement:
111
- version_requirements: !ruby/object:Gem::Version::Requirement
112
- requirements:
113
- - - ">"
114
- - !ruby/object:Gem::Version
115
- version: 0.0.0
116
- version:
117
- - !ruby/object:Gem::Dependency
118
- name: RubyInline
119
- version_requirement:
120
- version_requirements: !ruby/object:Gem::Version::Requirement
121
- requirements:
122
- - - "="
123
- - !ruby/object:Gem::Version
124
- version: 3.6.3
125
- version:
126
- - !ruby/object:Gem::Dependency
127
- name: ParseTree
128
- version_requirement:
129
- version_requirements: !ruby/object:Gem::Version::Requirement
130
- requirements:
131
- - - "="
132
- - !ruby/object:Gem::Version
133
- version: 1.7.1
134
- version:
135
- - !ruby/object:Gem::Dependency
136
- name: ruby2ruby
137
- version_requirement:
138
- version_requirements: !ruby/object:Gem::Version::Requirement
139
- requirements:
140
- - - "="
141
- - !ruby/object:Gem::Version
142
- version: 1.1.6
143
- version:
35
+ description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
36
+ email: peter@rubyrailways.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - COPYING
45
+ - README
46
+ - CHANGELOG
47
+ - Rakefile
48
+ - lib/scrubyt/core/navigation/agents/firewatir.rb
49
+ - lib/scrubyt/core/navigation/agents/mechanize.rb
50
+ - lib/scrubyt/core/navigation/fetch_action.rb
51
+ - lib/scrubyt/core/navigation/navigation_actions.rb
52
+ - lib/scrubyt/core/scraping/compound_example.rb
53
+ - lib/scrubyt/core/scraping/constraint.rb
54
+ - lib/scrubyt/core/scraping/constraint_adder.rb
55
+ - lib/scrubyt/core/scraping/filters/attribute_filter.rb
56
+ - lib/scrubyt/core/scraping/filters/base_filter.rb
57
+ - lib/scrubyt/core/scraping/filters/constant_filter.rb
58
+ - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
59
+ - lib/scrubyt/core/scraping/filters/download_filter.rb
60
+ - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
61
+ - lib/scrubyt/core/scraping/filters/regexp_filter.rb
62
+ - lib/scrubyt/core/scraping/filters/script_filter.rb
63
+ - lib/scrubyt/core/scraping/filters/text_filter.rb
64
+ - lib/scrubyt/core/scraping/filters/tree_filter.rb
65
+ - lib/scrubyt/core/scraping/pattern.rb
66
+ - lib/scrubyt/core/scraping/pre_filter_document.rb
67
+ - lib/scrubyt/core/scraping/result_indexer.rb
68
+ - lib/scrubyt/core/shared/extractor.rb
69
+ - lib/scrubyt/logging.rb
70
+ - lib/scrubyt/output/post_processor.rb
71
+ - lib/scrubyt/output/result.rb
72
+ - lib/scrubyt/output/result_dumper.rb
73
+ - lib/scrubyt/output/result_node.rb
74
+ - lib/scrubyt/output/scrubyt_result.rb
75
+ - lib/scrubyt/utils/compound_example_lookup.rb
76
+ - lib/scrubyt/utils/ruby_extensions.rb
77
+ - lib/scrubyt/utils/shared_utils.rb
78
+ - lib/scrubyt/utils/simple_example_lookup.rb
79
+ - lib/scrubyt/utils/xpathutils.rb
80
+ - lib/scrubyt.rb
81
+ has_rdoc: "true"
82
+ homepage: http://www.scrubyt.org
83
+ post_install_message:
84
+ rdoc_options: []
85
+
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: "0"
93
+ version:
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: "0"
99
+ version:
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.1
104
+ signing_key:
105
+ specification_version: 2
106
+ summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)
107
+ test_files: []
108
+