scrubyt 0.3.4 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,157 +0,0 @@
1
- module Scrubyt
2
- # =<tt>exporting previously defined extractors</tt>
3
- class Export
4
- ##
5
- #Exports the given extractor (specified by its root pattern) from the given file
6
- #
7
- #_input_file_ - the full path of the file where the extractor was defined. This can
8
- #be achieved by calling
9
- #
10
- # pattern.export(__FILE__)
11
- #
12
- #from the file of the extractor definition.
13
- #
14
- #*parameters*
15
- #
16
- #_root_pattern_ - the root pattern of the extractor. This is the variable 'something' in
17
- #such a call:
18
- #
19
- # something = Scrubyt::Extractor.define ...
20
- #
21
- #However, since the export method should not be called directly (pattern is calling
22
- #it), you will probably never need to care about this parameter.
23
- #
24
- #_output_file_name_ - the name of the file where the exported extractor should be
25
- #dumped. By default (i.e. if you don't specify this parameter) this is
26
- #"#{wrapper_name}_extractor_export.rb". You may override this setting by specifying
27
- #this optional parameter.
28
- #
29
- #_extractor_result_file_name_ - the name of the file, where the result of the
30
- #*exported* extractor should be dumped - for example, if _output_file_name_ is "foo.rb"
31
- #and _extractor_result_file_name_ is "bar.xml", the extractor is exported to a file named
32
- #"foo.rb", and after running "foo.rb", the results will be dumped to the file "bar.xml"
33
- #If this option is not specified, the result is dumped to standard output as XML.
34
- #
35
- #Examples:
36
- #
37
- # camera_data = Scrubyt::Extractor.define do
38
- # Action.fetch File.join(File.dirname(__FILE__), "input.html")
39
- #
40
- # P.item_name "Canon EOS 20D SLR Digital Camera (Lens sold separately)"
41
- # end
42
- #
43
- # camera_data.export(__FILE__)
44
- #
45
- #This will export this extractor to a file called "camera_data_extractor_export.rb".
46
- #When "camera_data_extractor_export.rb" is executed, the result is dumped
47
- #to the standard output.
48
- #
49
- #Note that the export method in the last line belongs to the class Scrubyt::Pattern
50
- #and not to Scrubyt::Export (i.e. this class). Scrubyt::Pattern.export will call
51
- #Scrubyt::Export.export.
52
- #
53
- # camera_data = Scrubyt::Extractor.define do
54
- # Action.fetch File.join(File.dirname(__FILE__), "input.html")
55
- #
56
- # P.item_name "Canon EOS 20D SLR Digital Camera (Lens sold separately)"
57
- # end
58
- #
59
- # camera_data.export(__FILE__, 'my_super_camera_extractor.rb', '/home/peter/stuff/result.xml')
60
- #
61
- #This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
62
- #After running 'my_super_camera_extractor.rb', the result will be dumped to the file
63
- #'/home/peter/stuff/result.xml'.
64
- def self.export(root_patterns, data)
65
- wrapper_name = data[:wrapper_name]
66
- template = data[:template] || 'default'
67
- output_file_name = data[:output_file_name]
68
- extractor_result_file_name = data[:extractor_result_file_name]
69
-
70
- case template
71
- when 'default'
72
- header = "require 'rubygems'\n"
73
- header += "require 'scrubyt'\n\n"
74
- header += "#{wrapper_name} = Scrubyt::Extractor.define do\n"
75
- if extractor_result_file_name
76
- footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
77
- else
78
- footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
79
- end
80
- indent = 1
81
- when 'lambda'
82
- header = "lambda do\n"
83
- footer = "\nend"
84
- indent = 1
85
- when 'none'
86
- header = ''
87
- footer = ''
88
- indent = 0
89
- else
90
- puts "Unknown template type: #{template}"
91
- return
92
- end
93
-
94
- header = data[:header] if data[:header]
95
- footer = data[:footer] if data[:footer]
96
- indent = data[:indent] if data[:indent]
97
-
98
- sexp = [:block]
99
- sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
100
- extractor = RubyToRuby.new.process(sexp).gsub('"' + data[:source_file] + '"', '__FILE__')
101
- extractor = extractor.strip
102
- extractor = extractor.split("\n").map{ |l| (' ' * indent) + l }.join("\n")
103
-
104
- result = header + extractor + footer
105
-
106
- if output_file_name
107
- open(output_file_name, 'w') do |file|
108
- file.write(result)
109
- end
110
- end
111
-
112
- result
113
- end
114
-
115
- private
116
- def self.create_sexp(code)
117
- (ParseTree.new.parse_tree_for_string(code))[0]
118
- end
119
-
120
- def self.export_extractor(root_patterns, source_proc, wrapper_name)
121
- # filter actions before and after pattern
122
- pre_pattern_sexp = []
123
- post_pattern_sexp = []
124
- patterns_passed = false
125
- actions = ['next_page', *NavigationActions.instance_methods]
126
-
127
- source_proc.to_sexp[3][1..-1].each do |sexp|
128
- get_call = lambda { |sexp|
129
- if sexp[0] == :fcall
130
- return sexp[1].to_s
131
- elsif sexp[0] == :iter || sexp[0] == :call
132
- return get_call.call(sexp[1])
133
- else
134
- return nil
135
- end
136
- }
137
- call = get_call.call(sexp)
138
- if(call.nil? || actions.index(call) != nil)
139
- if !patterns_passed
140
- pre_pattern_sexp.push(sexp)
141
- else
142
- post_pattern_sexp.push(sexp)
143
- end
144
- else
145
- patterns_passed = true
146
- end
147
- end
148
-
149
- # build extractor
150
- sexp = [:block]
151
- sexp.push([:block, *pre_pattern_sexp])
152
- sexp.push([:block, *root_patterns.to_sexp_array])
153
- sexp.push([:block, *post_pattern_sexp])
154
- sexp
155
- end
156
- end
157
- end