scrubyt 0.3.4 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +31 -0
- data/README +1 -1
- data/Rakefile +4 -9
- data/lib/scrubyt.rb +37 -56
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
- data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
- data/lib/scrubyt/core/scraping/pattern.rb +6 -27
- data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
- data/lib/scrubyt/core/shared/extractor.rb +15 -1
- data/lib/scrubyt/output/result_node.rb +42 -6
- data/lib/scrubyt/output/scrubyt_result.rb +35 -30
- data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
- data/lib/scrubyt/utils/xpathutils.rb +2 -1
- metadata +84 -119
- data/lib/scrubyt/output/export.rb +0 -157
@@ -1,157 +0,0 @@
|
|
1
|
-
module Scrubyt
|
2
|
-
# =<tt>exporting previously defined extractors</tt>
|
3
|
-
class Export
|
4
|
-
##
|
5
|
-
#Exports the given extractor (specified by its root pattern) from the given file
|
6
|
-
#
|
7
|
-
#_input_file_ - the full path of the file where the extractor was defined. This can
|
8
|
-
#be achieved by calling
|
9
|
-
#
|
10
|
-
# pattern.export(__FILE__)
|
11
|
-
#
|
12
|
-
#from the file of the extractor definition.
|
13
|
-
#
|
14
|
-
#*parameters*
|
15
|
-
#
|
16
|
-
#_root_pattern_ - the root pattern of the extractor. This is the variable 'something' in
|
17
|
-
#such a call:
|
18
|
-
#
|
19
|
-
# something = Scrubyt::Extractor.define ...
|
20
|
-
#
|
21
|
-
#However, since the export method should not be called directly (pattern is calling
|
22
|
-
#it), you will probably never need to care about this parameter.
|
23
|
-
#
|
24
|
-
#_output_file_name_ - the name of the file where the exported extractor should be
|
25
|
-
#dumped; by default (i.e. if you don't specify this parameter) this is
|
26
|
-
#"#{wrapper_name}_extractor_export.rb". You may override this setting if specifying
|
27
|
-
#this optional parameter.
|
28
|
-
#
|
29
|
-
#_extractor_result_file_name_ - the name of the file, where the result of the
|
30
|
-
#*exported* extractor should be dumped - for example, if _output_file_name_ is "foo.rb"
|
31
|
-
#and _extractor_result_file_name_ is "bar.xml", the extractor is exported to a file named
|
32
|
-
#"foo.rb", and after running "foo.rb", the results will be dumped to the file "bar.xml"
|
33
|
-
#If this option is not specified, the result is dumped to standard output as XML.
|
34
|
-
#
|
35
|
-
#Examples:
|
36
|
-
#
|
37
|
-
# camera_data = Scrubyt::Extractor.define do
|
38
|
-
# Action.fetch File.join(File.dirname(__FILE__), "input.html")
|
39
|
-
#
|
40
|
-
# P.item_name "Canon EOS 20D SLR Digital Camera (Lens sold separately)"
|
41
|
-
# end
|
42
|
-
#
|
43
|
-
# camera_data.export(__FILE__)
|
44
|
-
#
|
45
|
-
#This will export this extractor to a file called "camera_data_extractor_export.rb".
|
46
|
-
#If "camera_data_extractor_export.rb" is executed, the result will be dumped
|
47
|
-
#to the standard output.
|
48
|
-
#
|
49
|
-
#Note that the export method in the last line belongs to the class Scrubyt::Pattern
|
50
|
-
#and not to Scrubyt::Export (i.e. this class). Scrubyt::Pattern.export will call
|
51
|
-
#Scrubyt::Export.export.
|
52
|
-
#
|
53
|
-
# camera_data = Scrubyt::Extractor.define do
|
54
|
-
# Action.fetch File.join(File.dirname(__FILE__), "input.html")
|
55
|
-
#
|
56
|
-
# P.item_name "Canon EOS 20D SLR Digital Camera (Lens sold separately)"
|
57
|
-
# end
|
58
|
-
#
|
59
|
-
# camera_data.export(__FILE__, 'my_super_camera_extractor.rb', '/home/peter/stuff/result.xml')
|
60
|
-
#
|
61
|
-
#This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
|
62
|
-
#After running 'my_super_camera_extractor.rb', the result will be dumped to the file
|
63
|
-
#'/home/peter/stuff/result.xml'.
|
64
|
-
# Renders an extractor as Ruby source text and optionally writes it to a file.
#
# root_patterns - handed unchanged to export_extractor.
# data          - Hash of options. The keys read by this method are:
#                 :wrapper_name               - variable name used by the 'default' template
#                 :template                   - 'default' (used when absent), 'lambda' or 'none'
#                 :output_file_name           - when set, the generated source is written there
#                 :extractor_result_file_name - when set, the 'default' footer writes the XML
#                                               result to this file instead of $stdout
#                 :header, :footer, :indent   - override the values picked by the template
#                 :source_proc                - handed unchanged to export_extractor
#                 :source_file                - every double-quoted occurrence of this path in
#                                               the generated code is replaced by __FILE__
#
# Returns the generated source as a String, or nil (after printing a message)
# when the template name is not recognised.
def self.export(root_patterns, data)
  wrapper_name = data[:wrapper_name]
  template = data[:template] || 'default'
  output_file_name = data[:output_file_name]
  extractor_result_file_name = data[:extractor_result_file_name]

  case template
  when 'default'
    header = "require 'rubygems'\n" +
             "require 'scrubyt'\n\n" +
             "#{wrapper_name} = Scrubyt::Extractor.define do\n"
    if extractor_result_file_name
      footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
    else
      footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
    end
    indent = 1
  when 'lambda'
    header = "lambda do\n"
    footer = "\nend"
    indent = 1
  when 'none'
    header = ''
    footer = ''
    indent = 0
  else
    puts "Unknown template type: #{template}"
    return
  end

  # Explicit settings win over whatever the template selected.
  header = data[:header] if data[:header]
  footer = data[:footer] if data[:footer]
  indent = data[:indent] if data[:indent]

  sexp = [:block]
  sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
  extractor = RubyToRuby.new.process(sexp)

  # Only rewrite the source path when one was supplied; concatenating nil
  # into the search string would raise a TypeError.
  source_file = data[:source_file]
  extractor = extractor.gsub('"' + source_file + '"', '__FILE__') if source_file

  padding = ' ' * indent
  extractor = extractor.strip.split("\n").map { |line| padding + line }.join("\n")

  result = header + extractor + footer

  if output_file_name
    # File.open instead of Kernel#open: the latter runs a subprocess when the
    # name starts with "|".
    File.open(output_file_name, 'w') { |file| file.write(result) }
  end

  result
end
|
114
|
-
|
115
|
-
# NOTE: a bare `private` does not change the visibility of methods defined
# with `def self.`, so the class methods that follow stay publicly callable.
private

# Parses +code+ with ParseTree and returns the first element of the result.
def self.create_sexp(code)
  parsed = ParseTree.new.parse_tree_for_string(code)
  parsed[0]
end
|
119
|
-
|
120
|
-
# Builds the s-expression of an exported extractor.
#
# The statements of the extractor definition are split into those that come
# before the first pattern and those that come after it. The pattern
# statements themselves are dropped and replaced by
# root_patterns.to_sexp_array.
#
# root_patterns - object responding to to_sexp_array.
# source_proc   - object responding to to_sexp; element [3] of that sexp is
#                 read and its elements from index 1 onwards are treated as
#                 the statements of the definition.
# wrapper_name  - not used by this method.
#
# Returns [:block, [:block, *pre], [:block, *patterns], [:block, *post]].
def self.export_extractor(root_patterns, source_proc, wrapper_name)
  # Call names are compared as Strings. instance_methods yields Strings on
  # Ruby 1.8 but Symbols on 1.9+, so normalise explicitly.
  actions = ['next_page'] + NavigationActions.instance_methods.map { |name| name.to_s }

  # Name of the receiver-less call a statement boils down to, or nil when the
  # statement is not such a call. Defined once, with a parameter name that
  # does not clash with any local of the loop below.
  get_call = lambda do |node|
    next nil unless node.is_a?(Array)

    case node[0]
    when :fcall
      node[1].to_s
    when :iter, :call
      get_call.call(node[1])
    end
  end

  # filter actions before and after pattern
  pre_pattern_sexp = []
  post_pattern_sexp = []
  patterns_passed = false

  source_proc.to_sexp[3][1..-1].each do |statement|
    call = get_call.call(statement)
    if call.nil? || actions.include?(call)
      bucket = patterns_passed ? post_pattern_sexp : pre_pattern_sexp
      bucket.push(statement)
    else
      # Anything that is a call but not a navigation action counts as a pattern.
      patterns_passed = true
    end
  end

  # build extractor
  [
    :block,
    [:block, *pre_pattern_sexp],
    [:block, *root_patterns.to_sexp_array],
    [:block, *post_pattern_sexp]
  ]
end
|
156
|
-
end
|
157
|
-
end
|