scrubyt 0.3.4 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +31 -0
- data/README +1 -1
- data/Rakefile +4 -9
- data/lib/scrubyt.rb +37 -56
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
- data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
- data/lib/scrubyt/core/scraping/pattern.rb +6 -27
- data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
- data/lib/scrubyt/core/shared/extractor.rb +15 -1
- data/lib/scrubyt/output/result_node.rb +42 -6
- data/lib/scrubyt/output/scrubyt_result.rb +35 -30
- data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
- data/lib/scrubyt/utils/xpathutils.rb +2 -1
- metadata +84 -119
- data/lib/scrubyt/output/export.rb +0 -157
@@ -1,157 +0,0 @@
|
|
1
|
-
module Scrubyt
  # =<tt>exporting previously defined extractors</tt>
  class Export
    ##
    # Builds the Ruby source of an extractor and returns it as a String,
    # optionally writing it to a file as well.
    #
    # According to the original notes on this class, this method is not meant
    # to be called directly: Scrubyt::Pattern's export method calls
    # Scrubyt::Export.export.
    #
    # *parameters*
    #
    # _root_patterns_ - the root pattern(s) of the extractor; must respond to
    # +to_sexp_array+.
    #
    # _data_ - a Hash of options:
    #
    # <tt>:wrapper_name</tt>:: name of the variable the exported extractor is
    #                          assigned to (used by the 'default' template).
    # <tt>:template</tt>:: 'default' (standalone script), 'lambda' (wrapped in
    #                      <tt>lambda do ... end</tt>) or 'none' (bare body).
    #                      Defaults to 'default'.
    # <tt>:output_file_name</tt>:: if given, the generated source is also
    #                              written to this file.
    # <tt>:extractor_result_file_name</tt>:: 'default' template only. If given,
    #                                        the generated script writes its XML
    #                                        result to this file instead of
    #                                        standard output.
    # <tt>:header</tt>, <tt>:footer</tt>, <tt>:indent</tt>:: override the values
    #                                        chosen by the template.
    # <tt>:source_proc</tt>:: the block the extractor was defined with; must
    #                         respond to +to_sexp+.
    # <tt>:source_file</tt>:: path of the file the extractor was defined in.
    #                         Quoted occurrences of it in the generated code
    #                         are replaced by <tt>__FILE__</tt>.
    #
    # Returns the generated source, or +nil+ (after printing a message) when
    # the template name is unknown.
    def self.export(root_patterns, data)
      wrapper_name = data[:wrapper_name]
      template = data[:template] || 'default'
      output_file_name = data[:output_file_name]
      extractor_result_file_name = data[:extractor_result_file_name]

      case template
      when 'default'
        header = "require 'rubygems'\n"
        header += "require 'scrubyt'\n\n"
        header += "#{wrapper_name} = Scrubyt::Extractor.define do\n"
        if extractor_result_file_name
          footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
        else
          footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
        end
        indent = 1
      when 'lambda'
        header = "lambda do\n"
        footer = "\nend"
        indent = 1
      when 'none'
        header = ''
        footer = ''
        indent = 0
      else
        puts "Unknown template type: #{template}"
        return
      end

      # Explicit settings win over whatever the template selected.
      header = data[:header] if data[:header]
      footer = data[:footer] if data[:footer]
      indent = data[:indent] if data[:indent]

      sexp = [:block]
      sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
      extractor = RubyToRuby.new.process(sexp)

      # Make the export relocatable: references to the defining file become
      # __FILE__. Skipped when the path is unknown (concatenating nil would
      # raise a TypeError).
      source_file = data[:source_file]
      extractor = extractor.gsub("\"#{source_file}\"", '__FILE__') if source_file

      extractor = extractor.strip
      padding = ' ' * indent
      extractor = extractor.split("\n").map { |line| padding + line }.join("\n")

      result = header + extractor + footer

      if output_file_name
        # File.open rather than Kernel#open: the latter treats a name that
        # starts with "|" as a command to run.
        File.open(output_file_name, 'w') do |file|
          file.write(result)
        end
      end

      result
    end

    # NOTE: the helpers below used to follow a bare +private+, which has no
    # effect on methods defined with <tt>def self.</tt>; they have always been
    # callable from outside. They stay callable for compatibility, but should
    # be treated as internal.

    # Parses +code+ with ParseTree and returns the first element of the
    # resulting tree.
    def self.create_sexp(code)
      (ParseTree.new.parse_tree_for_string(code))[0]
    end

    # Assembles the s-expression of the exported extractor body: the non-pattern
    # statements that precede the first pattern in +source_proc+, then the
    # s-expressions of +root_patterns+, then the non-pattern statements that
    # follow. Pattern statements of +source_proc+ themselves are dropped, as
    # +root_patterns+ replaces them.
    #
    # _wrapper_name_ is accepted but not used here.
    def self.export_extractor(root_patterns, source_proc, wrapper_name)
      # filter actions before and after pattern
      pre_pattern_sexp = []
      post_pattern_sexp = []
      patterns_passed = false
      # instance_methods yields Strings on Ruby 1.8 and Symbols on later
      # versions; normalise so the String comparison below works on both.
      actions = ['next_page'] + NavigationActions.instance_methods.map { |name| name.to_s }

      source_proc.to_sexp[3][1..-1].each do |statement|
        call = leading_call_name(statement)
        if call.nil? || actions.include?(call)
          if patterns_passed
            post_pattern_sexp.push(statement)
          else
            pre_pattern_sexp.push(statement)
          end
        else
          # Any other call is taken to be a pattern definition.
          patterns_passed = true
        end
      end

      # build extractor
      sexp = [:block]
      sexp.push([:block, *pre_pattern_sexp])
      sexp.push([:block, *root_patterns.to_sexp_array])
      sexp.push([:block, *post_pattern_sexp])
      sexp
    end

    # Returns, as a String, the name of the receiver-less call (:fcall) found
    # by descending through the first child of :iter and :call nodes, or +nil+
    # when a different node type is reached.
    def self.leading_call_name(sexp)
      case sexp[0]
      when :fcall
        sexp[1].to_s
      when :iter, :call
        leading_call_name(sexp[1])
      end
    end
  end
end
|