scrubyt 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,157 +0,0 @@
module Scrubyt
  # =<tt>exporting previously defined extractors</tt>
  class Export
    ##
    #Exports the given extractor (specified by its root patterns) as Ruby source
    #code and returns that source as a String.
    #
    #The export method is not normally called directly: the original
    #documentation states that Scrubyt::Pattern.export calls Scrubyt::Export.export,
    #e.g.
    #
    #  camera_data = Scrubyt::Extractor.define do
    #    Action.fetch File.join(File.dirname(__FILE__), "input.html")
    #
    #    P.item_name "Canon EOS 20D SLR Digital Camera (Lens sold separately)"
    #  end
    #
    #  camera_data.export(__FILE__)
    #
    #or, naming both the exported file and the file the exported extractor
    #should dump its result to:
    #
    #  camera_data.export(__FILE__, 'my_super_camera_extractor.rb', '/home/peter/stuff/result.xml')
    #
    #NOTE(review): how the caller maps those arguments onto _data_ (and any default
    #output file name such as "#{wrapper_name}_extractor_export.rb") is not visible
    #in this class - confirm against Scrubyt::Pattern.
    #
    #*parameters*
    #
    #_root_patterns_ - object responding to +to_sexp_array+; its s-expressions form
    #the pattern part of the exported extractor.
    #
    #_data_ - Hash of options read by this method:
    #
    #[:wrapper_name]  name of the variable the 'default' template assigns the
    #                 extractor to
    #[:template]      'default' (stand-alone script), 'lambda' (wrapped in
    #                 <tt>lambda do ... end</tt>) or 'none' (bare code); defaults
    #                 to 'default'
    #[:output_file_name]  if given, the generated source is also written to
    #                     this file
    #[:extractor_result_file_name]  'default' template only: file the *exported*
    #                               extractor writes its XML to; when absent the
    #                               generated script writes to standard output
    #[:header], [:footer], [:indent]  override the values chosen by the template
    #[:source_proc]   object responding to +to_sexp+ (the extractor definition)
    #[:source_file]   path of the defining file; quoted occurrences of it in the
    #                 generated code are replaced by __FILE__. Optional.
    #
    #Returns the generated source, or nil (after printing a message) when
    #_data_[:template] is not one of the known template names.
    def self.export(root_patterns, data)
      wrapper_name = data[:wrapper_name]
      template = data[:template] || 'default'
      output_file_name = data[:output_file_name]
      extractor_result_file_name = data[:extractor_result_file_name]

      case template
      when 'default'
        header = ["require 'rubygems'\n",
                  "require 'scrubyt'\n\n",
                  "#{wrapper_name} = Scrubyt::Extractor.define do\n"].join
        # The generated script dumps either to the requested file or to stdout.
        sink = extractor_result_file_name ? "open('#{extractor_result_file_name}', 'w')" : '$stdout'
        footer = "\nend\n\n#{wrapper_name}.to_xml.write(#{sink}, 1)"
        indent = 1
      when 'lambda'
        header = "lambda do\n"
        footer = "\nend"
        indent = 1
      when 'none'
        header = ''
        footer = ''
        indent = 0
      else
        puts "Unknown template type: #{template}"
        return
      end

      # Explicit settings win over whatever the template selected.
      header = data[:header] if data[:header]
      footer = data[:footer] if data[:footer]
      indent = data[:indent] if data[:indent]

      sexp = [:block, export_extractor(root_patterns, data[:source_proc], wrapper_name)]
      extractor = RubyToRuby.new.process(sexp)

      # Make the exported code relocatable: the literal path of the defining file
      # becomes __FILE__. Skipped when no source file was supplied (the original
      # raised a TypeError on nil here).
      source_file = data[:source_file]
      extractor = extractor.gsub("\"#{source_file}\"", '__FILE__') if source_file

      padding = ' ' * indent
      extractor = extractor.strip.split("\n").map { |line| padding + line }.join("\n")

      result = header + extractor + footer

      if output_file_name
        # File.open rather than Kernel#open: the latter would run a command if the
        # file name started with "|".
        File.open(output_file_name, 'w') { |file| file.write(result) }
      end

      result
    end

    # NOTE: the original source placed a bare `private` here. That keyword does not
    # affect methods defined with `def self.`, so create_sexp and export_extractor
    # have always been publicly callable; they are left that way for compatibility.

    #Parses _code_ (a String of Ruby source) with ParseTree and returns the first
    #element of the resulting parse tree.
    def self.create_sexp(code)
      ParseTree.new.parse_tree_for_string(code)[0]
    end

    #Builds the s-expression of the exported extractor.
    #
    #The statements of _source_proc_ (taken from <tt>source_proc.to_sexp[3][1..-1]</tt>)
    #are scanned in order. Statements that are navigation actions - or whose called
    #method cannot be determined - are kept, split into those that occur before the
    #first pattern statement and those that occur after it. The pattern statements
    #themselves are dropped and replaced by <tt>root_patterns.to_sexp_array</tt>.
    #
    #_wrapper_name_ is accepted for interface compatibility but not used here.
    #
    #Returns <tt>[:block, [:block, *pre], [:block, *patterns], [:block, *post]]</tt>.
    def self.export_extractor(root_patterns, source_proc, wrapper_name)
      # instance_methods yields Strings on Ruby 1.8 but Symbols on 1.9+; normalise
      # so the comparison with the String produced by called_method_name works on both.
      actions = ['next_page'] + NavigationActions.instance_methods.map { |name| name.to_s }

      pre_pattern_sexp = []
      post_pattern_sexp = []
      patterns_passed = false

      source_proc.to_sexp[3][1..-1].each do |statement|
        call = called_method_name(statement)
        if call.nil? || actions.include?(call)
          (patterns_passed ? post_pattern_sexp : pre_pattern_sexp).push(statement)
        else
          # First non-action call marks the start of the pattern section.
          patterns_passed = true
        end
      end

      [:block,
       [:block, *pre_pattern_sexp],
       [:block, *root_patterns.to_sexp_array],
       [:block, *post_pattern_sexp]]
    end

    #Returns, as a String, the name of the receiver-less method call (:fcall) found
    #by following the second element of nested :iter / :call nodes; nil when the
    #node is of any other type.
    def self.called_method_name(sexp)
      case sexp[0]
      when :fcall then sexp[1].to_s
      when :iter, :call then called_method_name(sexp[1])
      end
    end
    private_class_method :called_method_name
  end
end