bliss 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +12 -0
- data/Gemfile +2 -2
- data/Gemfile.lock +11 -7
- data/Rakefile +7 -1
- data/VERSION +1 -1
- data/bliss.gemspec +18 -10
- data/complete_test.rb +74 -0
- data/gzip_support.rb +41 -0
- data/http-machine.rb +2 -3
- data/lib/bliss.rb +5 -1
- data/lib/bliss/constraint.rb +78 -0
- data/lib/bliss/encoding_error.rb +4 -0
- data/lib/bliss/format.rb +97 -0
- data/lib/bliss/parser.rb +208 -3
- data/lib/bliss/parser_machine.rb +93 -123
- data/lib/hash_extension.rb +16 -0
- data/spec.yml +23 -0
- data/spec/format_spec.rb +63 -0
- data/spec/spec_helper.rb +16 -0
- data/test.rb +44 -2
- metadata +31 -23
- data/lib/bliss/sax_parser.rb +0 -122
data/lib/bliss/parser.rb
CHANGED
@@ -1,12 +1,217 @@
|
|
1
1
|
module Bliss
|
2
2
|
class Parser
|
3
|
-
def initialize(path)
|
3
|
+
def initialize(path, filepath=nil)
|
4
4
|
@path = path
|
5
|
-
|
5
|
+
|
6
|
+
@parser_machine = Bliss::ParserMachine.new
|
7
|
+
|
8
|
+
@push_parser = Nokogiri::XML::SAX::PushParser.new(@parser_machine)
|
9
|
+
|
10
|
+
if filepath
|
11
|
+
@file = File.new(filepath, 'w')
|
12
|
+
@file.autoclose = false
|
13
|
+
end
|
14
|
+
|
15
|
+
@root = nil
|
16
|
+
@nodes = nil
|
17
|
+
|
18
|
+
on_root {}
|
19
|
+
end
|
20
|
+
|
21
|
+
def on_root(&block)
|
22
|
+
return false if not block.is_a? Proc
|
23
|
+
@parser_machine.on_root { |root|
|
24
|
+
@root = root
|
25
|
+
block.call(root)
|
26
|
+
}
|
27
|
+
end
|
28
|
+
|
29
|
+
# Registers a callback to run when a tag opens.
#
# element - tag name to watch (defaults to 'default').
# block   - callback taking exactly one argument (the value the parser
#           machine passes to its open handlers).
#
# Returns false without registering anything when no block is given or
# when the block does not take exactly one argument; otherwise returns
# whatever @parser_machine.on_tag_open returns.
def on_tag_open(element='default', &block)
  # Without a block, block is nil and block.arity would raise NoMethodError.
  return false if block.nil? || block.arity != 1

  overriden_block = Proc.new { |depth|
    # Only a handler registered for a specific tag counts as "handling"
    # input; the 'default' handler leaves the unhandled byte counter alone.
    reset_unhandled_bytes unless element == 'default'
    block.call(depth)
  }
  @parser_machine.on_tag_open(element, overriden_block)
end
|
40
|
+
|
41
|
+
# Registers a callback to run when a tag closes.
#
# element - tag name to watch (defaults to 'default').
# block   - callback invoked with the two arguments the parser machine
#           passes to its close handlers (hash, depth).
#
# Returns false without registering anything when no block is given;
# otherwise returns whatever @parser_machine.on_tag_close returns.
def on_tag_close(element='default', &block)
  # Registering a wrapper around a nil block would only fail later, in the
  # middle of parsing, with NoMethodError on nil.call.
  return false if block.nil?

  overriden_block = Proc.new { |hash, depth|
    # Unlike on_tag_open, the counter is reset for every closing tag,
    # including the 'default' handler.
    reset_unhandled_bytes
    block.call(hash, depth)
  }
  @parser_machine.on_tag_close(element, overriden_block)
end
|
50
|
+
|
51
|
+
def on_max_unhandled_bytes(bytes, &block)
|
52
|
+
@max_unhandled_bytes = bytes
|
53
|
+
@on_max_unhandled_bytes = block
|
54
|
+
end
|
55
|
+
|
56
|
+
def wait_tag_close(element)
|
57
|
+
@wait_tag_close = "</#{element}>"
|
58
|
+
end
|
59
|
+
|
60
|
+
def reset_unhandled_bytes
|
61
|
+
return false if not check_unhandled_bytes?
|
62
|
+
@unhandled_bytes = 0
|
63
|
+
end
|
64
|
+
|
65
|
+
def check_unhandled_bytes
|
66
|
+
if @unhandled_bytes > @max_unhandled_bytes
|
67
|
+
if @on_max_unhandled_bytes
|
68
|
+
@on_max_unhandled_bytes.call
|
69
|
+
@on_max_unhandled_bytes = nil
|
70
|
+
end
|
71
|
+
#self.close
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
# Tells whether more bytes have gone unhandled than the configured limit.
#
# Returns false when no limit is configured (check_unhandled_bytes? is
# false) or when the counter is at or below the limit; true otherwise.
# Always returns a strict boolean.
def exceeded?
  return false unless check_unhandled_bytes?

  # The counter is nil until it has been reset at least once; to_i treats
  # that as zero instead of raising on nil > Integer.
  @unhandled_bytes.to_i > @max_unhandled_bytes
end
|
81
|
+
|
82
|
+
def check_unhandled_bytes?
|
83
|
+
@max_unhandled_bytes ? true : false
|
84
|
+
end
|
85
|
+
|
86
|
+
def root
|
87
|
+
@root
|
88
|
+
end
|
89
|
+
|
90
|
+
def close
|
91
|
+
@parser_machine.close
|
6
92
|
end
|
7
93
|
|
8
94
|
def parse
|
9
|
-
|
95
|
+
reset_unhandled_bytes if check_unhandled_bytes?
|
96
|
+
|
97
|
+
EM.run do
|
98
|
+
http = EM::HttpRequest.new(@path).get
|
99
|
+
|
100
|
+
@autodetect_compression = true
|
101
|
+
compression = :none
|
102
|
+
if @autodetect_compression
|
103
|
+
http.headers do
|
104
|
+
if (/^attachment.+filename.+\.gz/i === http.response_header['CONTENT_DISPOSITION']) or http.response_header.compressed? or ["application/octet-stream", "application/x-gzip"].include? http.response_header['CONTENT_TYPE']
|
105
|
+
@zstream = Zlib::Inflate.new(Zlib::MAX_WBITS+16)
|
106
|
+
compression = :gzip
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
http.stream { |chunk|
|
112
|
+
if chunk
|
113
|
+
chunk.force_encoding('UTF-8')
|
114
|
+
|
115
|
+
if check_unhandled_bytes?
|
116
|
+
@unhandled_bytes += chunk.length
|
117
|
+
check_unhandled_bytes
|
118
|
+
end
|
119
|
+
if not @parser_machine.is_closed?
|
120
|
+
begin
|
121
|
+
case compression
|
122
|
+
when :gzip
|
123
|
+
chunk = @zstream.inflate(chunk)
|
124
|
+
chunk.force_encoding('UTF-8')
|
125
|
+
end
|
126
|
+
@push_parser << chunk
|
127
|
+
if @file
|
128
|
+
@file << chunk
|
129
|
+
end
|
130
|
+
rescue Nokogiri::XML::SyntaxError => e
|
131
|
+
#puts 'encoding error'
|
132
|
+
if e.message.include?("encoding")
|
133
|
+
raise Bliss::EncodingError, "Wrong encoding given"
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
else
|
138
|
+
if exceeded?
|
139
|
+
#puts 'exceeded'
|
140
|
+
secure_close
|
141
|
+
else
|
142
|
+
if @file
|
143
|
+
if @wait_tag_close
|
144
|
+
#puts 'handle wait'
|
145
|
+
handle_wait_tag_close(chunk) #if @wait_tag_close
|
146
|
+
else
|
147
|
+
#puts 'secure close'
|
148
|
+
secure_close
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
}
|
155
|
+
http.errback {
|
156
|
+
#puts 'errback'
|
157
|
+
secure_close
|
158
|
+
}
|
159
|
+
http.callback {
|
160
|
+
#if @file
|
161
|
+
# @file.close
|
162
|
+
#end
|
163
|
+
#EM.stop
|
164
|
+
secure_close
|
165
|
+
}
|
166
|
+
end
|
167
|
+
file_close
|
10
168
|
end
|
169
|
+
|
170
|
+
# Guesses the compression of an HTTP response from its headers.
#
# The response is treated as gzip when any of these hold:
#   * Content-Disposition is an attachment whose filename contains ".gz"
#   * the response header object reports itself as compressed
#   * Content-Type is application/octet-stream or application/x-gzip
#
# http - object responding to #response_header; that header object must
#        support #[] with 'CONTENT_DISPOSITION' / 'CONTENT_TYPE' keys and
#        #compressed?.
#
# Returns :gzip or :none.
def autodetect_compression(http)
  headers = http.response_header

  # Regexp#=== is false for a nil header, so a missing header is harmless.
  gzip_attachment = /^attachment.+filename.+\.gz/i === headers['CONTENT_DISPOSITION']
  gzip_type = ["application/octet-stream", "application/x-gzip"].include?(headers['CONTENT_TYPE'])

  (gzip_attachment || headers.compressed? || gzip_type) ? :gzip : :none
end
|
175
|
+
|
176
|
+
# Writes a chunk to @file while waiting for the closing tag stored in
# @wait_tag_close.
#
# When the closing tag occurs in the chunk, everything up to and including
# that tag is written, followed by a closing tag for the root element, and
# the transfer is ended via secure_close. Otherwise the whole chunk is
# written and the method keeps waiting.
#
# chunk - String of data received from the stream.
#
# Any error raised while writing ends the transfer via secure_close
# (best-effort, as before).
def handle_wait_tag_close(chunk)
  begin
    tag_start = chunk.index(@wait_tag_close)
    if tag_start
      # Cut right after the closing tag. The offset must come from the tag's
      # own length: a fixed offset only fits one particular tag-name length
      # and truncates or overruns every other tag.
      cut = tag_start + @wait_tag_close.length
      @file << chunk[0, cut]
      @file << "</#{root}>" # TODO set this by using actual depth, so all tags get closed
      secure_close
    else
      @file << chunk
    end
  rescue
    secure_close
  end
end
|
191
|
+
|
192
|
+
def file_close
|
193
|
+
if @file
|
194
|
+
@file.close
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
def secure_close
|
199
|
+
begin
|
200
|
+
if @zstream
|
201
|
+
@zstream.close
|
202
|
+
end
|
203
|
+
rescue
|
204
|
+
ensure
|
205
|
+
EM.stop
|
206
|
+
#puts "Closed secure."
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
11
210
|
end
|
12
211
|
end
|
212
|
+
|
213
|
+
#require 'stringio'
|
214
|
+
#str = StringIO.new
|
215
|
+
#z = Zlib::GzipWriter.new(str)
|
216
|
+
#z.write(txt)
|
217
|
+
#z.close
|
data/lib/bliss/parser_machine.rb
CHANGED
@@ -1,161 +1,131 @@
|
|
1
1
|
module Bliss
|
2
|
-
class ParserMachine
|
3
|
-
|
2
|
+
class ParserMachine < Nokogiri::XML::SAX::Document
|
3
|
+
def initialize
|
4
|
+
@depth = []
|
5
|
+
# @settings = {} # downcased
|
4
6
|
|
5
|
-
|
6
|
-
@
|
7
|
-
|
8
|
-
@sax_parser = Bliss::SaxParser.new
|
7
|
+
@root = nil
|
8
|
+
@nodes = {}
|
9
|
+
@current_node = {}
|
9
10
|
|
10
|
-
@
|
11
|
+
@on_root = nil
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
end
|
13
|
+
@on_tag_open = {}
|
14
|
+
@on_tag_close = {}
|
15
15
|
|
16
|
-
@
|
17
|
-
@nodes = nil
|
16
|
+
@closed = false
|
18
17
|
|
19
|
-
on_root {}
|
20
18
|
end
|
21
19
|
|
22
20
|
def on_root(&block)
|
23
|
-
|
24
|
-
@sax_parser.on_root { |root|
|
25
|
-
@root = root
|
26
|
-
block.call(root)
|
27
|
-
}
|
21
|
+
@on_root = block
|
28
22
|
end
|
29
23
|
|
30
|
-
def on_tag_open(element,
|
31
|
-
|
32
|
-
|
33
|
-
overriden_block = Proc.new { |depth|
|
34
|
-
reset_unhandled_bytes
|
35
|
-
block.call(depth)
|
36
|
-
}
|
37
|
-
@sax_parser.on_tag_open(element, overriden_block)
|
24
|
+
def on_tag_open(element, block)
|
25
|
+
@on_tag_open.merge!({element => block})
|
38
26
|
end
|
39
27
|
|
40
|
-
def on_tag_close(element,
|
41
|
-
|
42
|
-
reset_unhandled_bytes
|
43
|
-
block.call(hash)
|
44
|
-
}
|
45
|
-
@sax_parser.on_tag_close(element, overriden_block)
|
28
|
+
def on_tag_close(element, block)
|
29
|
+
@on_tag_close.merge!({element => block})
|
46
30
|
end
|
47
31
|
|
48
|
-
def
|
49
|
-
@
|
32
|
+
def close
|
33
|
+
@closed = true
|
50
34
|
end
|
51
35
|
|
52
|
-
def
|
53
|
-
|
54
|
-
@unhandled_bytes = 0
|
36
|
+
def is_closed?
|
37
|
+
@closed
|
55
38
|
end
|
56
39
|
|
57
|
-
def
|
58
|
-
if
|
59
|
-
|
40
|
+
def start_element(element, attributes)
|
41
|
+
return if is_closed?
|
42
|
+
# element_transformation
|
43
|
+
|
44
|
+
if @root == nil
|
45
|
+
@root = element
|
46
|
+
if @on_root.is_a? Proc
|
47
|
+
@on_root.call(@root)
|
48
|
+
end
|
60
49
|
end
|
61
|
-
end
|
62
50
|
|
63
|
-
|
64
|
-
|
65
|
-
if @
|
66
|
-
|
51
|
+
@depth.push(element) if @depth.last != element
|
52
|
+
|
53
|
+
if @on_tag_open.has_key? element
|
54
|
+
@on_tag_open[element].call(@depth)
|
55
|
+
elsif @on_tag_open.has_key? 'default'
|
56
|
+
@on_tag_open['default'].call(@depth)
|
67
57
|
end
|
68
|
-
end
|
69
58
|
|
70
|
-
|
71
|
-
@max_unhandled_bytes ? true : false
|
72
|
-
end
|
59
|
+
current = @nodes.pair_at_chain(@depth)
|
73
60
|
|
74
|
-
|
75
|
-
|
61
|
+
value_at = @nodes.value_at_chain(@depth)
|
62
|
+
|
63
|
+
if current.is_a? Hash
|
64
|
+
if value_at.is_a? NilClass
|
65
|
+
current[element] = {}
|
66
|
+
elsif value_at.is_a? Hash
|
67
|
+
if current[element].is_a? Array
|
68
|
+
current[element].concat [{}]
|
69
|
+
else
|
70
|
+
current[element] = [current[element], {}]
|
71
|
+
#current = @nodes.pair_at_chain(@depth)
|
72
|
+
end
|
73
|
+
elsif value_at.is_a? Array
|
74
|
+
#puts @depth.inspect
|
75
|
+
#puts current[element].inspect
|
76
|
+
#puts current[element].inspect
|
77
|
+
end
|
78
|
+
elsif current.is_a? Array
|
79
|
+
end
|
80
|
+
|
81
|
+
@current_content = ''
|
76
82
|
end
|
77
83
|
|
78
|
-
def
|
79
|
-
|
84
|
+
def characters(string)
|
85
|
+
return if is_closed?
|
86
|
+
concat_content(string)
|
80
87
|
end
|
81
88
|
|
82
|
-
def
|
83
|
-
|
84
|
-
|
85
|
-
EM.run do
|
86
|
-
http = EM::HttpRequest.new(@path).get
|
87
|
-
http.stream { |chunk|
|
88
|
-
if chunk
|
89
|
-
chunk.force_encoding('UTF-8')
|
90
|
-
|
91
|
-
@parser << chunk
|
92
|
-
|
93
|
-
if check_unhandled_bytes?
|
94
|
-
@unhandled_bytes += chunk.length
|
95
|
-
check_unhandled_bytes
|
96
|
-
end
|
97
|
-
|
98
|
-
if not @sax_parser.is_closed?
|
99
|
-
if @file
|
100
|
-
@file << chunk
|
101
|
-
end
|
102
|
-
else
|
103
|
-
if exceeded?
|
104
|
-
#puts 'exceeded'
|
105
|
-
secure_close
|
106
|
-
else
|
107
|
-
if @file
|
108
|
-
if @wait_tag_close
|
109
|
-
#puts 'handle wait'
|
110
|
-
handle_wait_tag_close(chunk) #if @wait_tag_close
|
111
|
-
else
|
112
|
-
#puts 'secure close'
|
113
|
-
secure_close
|
114
|
-
end
|
115
|
-
end
|
116
|
-
end
|
117
|
-
end
|
118
|
-
end
|
119
|
-
}
|
120
|
-
http.callback {
|
121
|
-
if @file
|
122
|
-
@file.close
|
123
|
-
end
|
124
|
-
EM.stop
|
125
|
-
}
|
126
|
-
end
|
89
|
+
def cdata_block(string)
|
90
|
+
return if is_closed?
|
91
|
+
concat_content(string)
|
127
92
|
end
|
128
|
-
|
129
|
-
def
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
93
|
+
|
94
|
+
def end_element(element, attributes=[])
|
95
|
+
return if is_closed?
|
96
|
+
# element_transformation
|
97
|
+
|
98
|
+
current = @nodes.pair_at_chain(@depth)
|
99
|
+
value_at = @nodes.value_at_chain(@depth)
|
100
|
+
|
101
|
+
if value_at.is_a? Hash
|
102
|
+
current[element] = @current_content if @current_content.size > 0
|
103
|
+
elsif value_at.is_a? NilClass
|
104
|
+
if current.is_a? Array
|
105
|
+
current = current.last
|
106
|
+
current[element] = @current_content if @current_content.size > 0
|
139
107
|
end
|
140
|
-
rescue
|
141
|
-
secure_close
|
142
108
|
end
|
109
|
+
@current_content = ''
|
110
|
+
|
111
|
+
if @on_tag_close.has_key? element
|
112
|
+
@on_tag_close[element].call(value_at, @depth)
|
113
|
+
elsif @on_tag_close.has_key? 'default'
|
114
|
+
@on_tag_close['default'].call(value_at, @depth)
|
115
|
+
end
|
116
|
+
|
117
|
+
@depth.pop if @depth.last == element
|
143
118
|
end
|
144
119
|
|
145
|
-
def
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
ensure
|
150
|
-
EM.stop
|
120
|
+
# Appends text to the content buffer of the element being read.
#
# Leading and trailing whitespace is removed from each piece before it is
# appended (pieces are stripped individually, not the joined result).
#
# string - String of character or CDATA content; nil is ignored.
#
# Returns the content buffer, or nil when string is nil.
def concat_content(string)
  return if string.nil?

  # The buffer is normally created when an element starts; create it here
  # too so text arriving first does not fail on nil.
  @current_content ||= ''

  # Non-destructive strip: strip! would modify the caller's string, raise on
  # a frozen one, and return nil when there is nothing to strip.
  @current_content << string.strip
end
|
153
126
|
|
127
|
+
def end_document
|
128
|
+
puts @nodes.inspect
|
129
|
+
end
|
154
130
|
end
|
155
131
|
end
|
156
|
-
|
157
|
-
#require 'stringio'
|
158
|
-
#str = StringIO.new
|
159
|
-
#z = Zlib::GzipWriter.new(str)
|
160
|
-
#z.write(txt)
|
161
|
-
#z.close
|