bliss 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +12 -0
- data/Gemfile +2 -2
- data/Gemfile.lock +11 -7
- data/Rakefile +7 -1
- data/VERSION +1 -1
- data/bliss.gemspec +18 -10
- data/complete_test.rb +74 -0
- data/gzip_support.rb +41 -0
- data/http-machine.rb +2 -3
- data/lib/bliss.rb +5 -1
- data/lib/bliss/constraint.rb +78 -0
- data/lib/bliss/encoding_error.rb +4 -0
- data/lib/bliss/format.rb +97 -0
- data/lib/bliss/parser.rb +208 -3
- data/lib/bliss/parser_machine.rb +93 -123
- data/lib/hash_extension.rb +16 -0
- data/spec.yml +23 -0
- data/spec/format_spec.rb +63 -0
- data/spec/spec_helper.rb +16 -0
- data/test.rb +44 -2
- metadata +31 -23
- data/lib/bliss/sax_parser.rb +0 -122
data/lib/bliss/parser.rb
CHANGED
@@ -1,12 +1,217 @@
|
|
1
1
|
module Bliss
|
2
2
|
class Parser
|
3
|
-
def initialize(path)
|
3
|
+
def initialize(path, filepath=nil)
|
4
4
|
@path = path
|
5
|
-
|
5
|
+
|
6
|
+
@parser_machine = Bliss::ParserMachine.new
|
7
|
+
|
8
|
+
@push_parser = Nokogiri::XML::SAX::PushParser.new(@parser_machine)
|
9
|
+
|
10
|
+
if filepath
|
11
|
+
@file = File.new(filepath, 'w')
|
12
|
+
@file.autoclose = false
|
13
|
+
end
|
14
|
+
|
15
|
+
@root = nil
|
16
|
+
@nodes = nil
|
17
|
+
|
18
|
+
on_root {}
|
19
|
+
end
|
20
|
+
|
21
|
+
def on_root(&block)
|
22
|
+
return false if not block.is_a? Proc
|
23
|
+
@parser_machine.on_root { |root|
|
24
|
+
@root = root
|
25
|
+
block.call(root)
|
26
|
+
}
|
27
|
+
end
|
28
|
+
|
29
|
+
def on_tag_open(element='default', &block)
|
30
|
+
return false if block.arity != 1
|
31
|
+
|
32
|
+
overriden_block = Proc.new { |depth|
|
33
|
+
if not element == 'default'
|
34
|
+
reset_unhandled_bytes
|
35
|
+
end
|
36
|
+
block.call(depth)
|
37
|
+
}
|
38
|
+
@parser_machine.on_tag_open(element, overriden_block)
|
39
|
+
end
|
40
|
+
|
41
|
+
def on_tag_close(element='default', &block)
|
42
|
+
overriden_block = Proc.new { |hash, depth|
|
43
|
+
#if not element == 'default'
|
44
|
+
reset_unhandled_bytes
|
45
|
+
#end
|
46
|
+
block.call(hash, depth)
|
47
|
+
}
|
48
|
+
@parser_machine.on_tag_close(element, overriden_block)
|
49
|
+
end
|
50
|
+
|
51
|
+
def on_max_unhandled_bytes(bytes, &block)
|
52
|
+
@max_unhandled_bytes = bytes
|
53
|
+
@on_max_unhandled_bytes = block
|
54
|
+
end
|
55
|
+
|
56
|
+
def wait_tag_close(element)
|
57
|
+
@wait_tag_close = "</#{element}>"
|
58
|
+
end
|
59
|
+
|
60
|
+
def reset_unhandled_bytes
|
61
|
+
return false if not check_unhandled_bytes?
|
62
|
+
@unhandled_bytes = 0
|
63
|
+
end
|
64
|
+
|
65
|
+
def check_unhandled_bytes
|
66
|
+
if @unhandled_bytes > @max_unhandled_bytes
|
67
|
+
if @on_max_unhandled_bytes
|
68
|
+
@on_max_unhandled_bytes.call
|
69
|
+
@on_max_unhandled_bytes = nil
|
70
|
+
end
|
71
|
+
#self.close
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def exceeded?
|
76
|
+
return false if not check_unhandled_bytes?
|
77
|
+
if @unhandled_bytes > @max_unhandled_bytes
|
78
|
+
return true
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def check_unhandled_bytes?
|
83
|
+
@max_unhandled_bytes ? true : false
|
84
|
+
end
|
85
|
+
|
86
|
+
def root
|
87
|
+
@root
|
88
|
+
end
|
89
|
+
|
90
|
+
def close
|
91
|
+
@parser_machine.close
|
6
92
|
end
|
7
93
|
|
8
94
|
def parse
|
9
|
-
|
95
|
+
reset_unhandled_bytes if check_unhandled_bytes?
|
96
|
+
|
97
|
+
EM.run do
|
98
|
+
http = EM::HttpRequest.new(@path).get
|
99
|
+
|
100
|
+
@autodetect_compression = true
|
101
|
+
compression = :none
|
102
|
+
if @autodetect_compression
|
103
|
+
http.headers do
|
104
|
+
if (/^attachment.+filename.+\.gz/i === http.response_header['CONTENT_DISPOSITION']) or http.response_header.compressed? or ["application/octet-stream", "application/x-gzip"].include? http.response_header['CONTENT_TYPE']
|
105
|
+
@zstream = Zlib::Inflate.new(Zlib::MAX_WBITS+16)
|
106
|
+
compression = :gzip
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
http.stream { |chunk|
|
112
|
+
if chunk
|
113
|
+
chunk.force_encoding('UTF-8')
|
114
|
+
|
115
|
+
if check_unhandled_bytes?
|
116
|
+
@unhandled_bytes += chunk.length
|
117
|
+
check_unhandled_bytes
|
118
|
+
end
|
119
|
+
if not @parser_machine.is_closed?
|
120
|
+
begin
|
121
|
+
case compression
|
122
|
+
when :gzip
|
123
|
+
chunk = @zstream.inflate(chunk)
|
124
|
+
chunk.force_encoding('UTF-8')
|
125
|
+
end
|
126
|
+
@push_parser << chunk
|
127
|
+
if @file
|
128
|
+
@file << chunk
|
129
|
+
end
|
130
|
+
rescue Nokogiri::XML::SyntaxError => e
|
131
|
+
#puts 'encoding error'
|
132
|
+
if e.message.include?("encoding")
|
133
|
+
raise Bliss::EncodingError, "Wrong encoding given"
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
else
|
138
|
+
if exceeded?
|
139
|
+
#puts 'exceeded'
|
140
|
+
secure_close
|
141
|
+
else
|
142
|
+
if @file
|
143
|
+
if @wait_tag_close
|
144
|
+
#puts 'handle wait'
|
145
|
+
handle_wait_tag_close(chunk) #if @wait_tag_close
|
146
|
+
else
|
147
|
+
#puts 'secure close'
|
148
|
+
secure_close
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
}
|
155
|
+
http.errback {
|
156
|
+
#puts 'errback'
|
157
|
+
secure_close
|
158
|
+
}
|
159
|
+
http.callback {
|
160
|
+
#if @file
|
161
|
+
# @file.close
|
162
|
+
#end
|
163
|
+
#EM.stop
|
164
|
+
secure_close
|
165
|
+
}
|
166
|
+
end
|
167
|
+
file_close
|
10
168
|
end
|
169
|
+
|
170
|
+
def autodetect_compression(http)
|
171
|
+
#compression = :none
|
172
|
+
puts compression
|
173
|
+
return compression
|
174
|
+
end
|
175
|
+
|
176
|
+
def handle_wait_tag_close(chunk)
|
177
|
+
begin
|
178
|
+
last_index = chunk.index(@wait_tag_close)
|
179
|
+
if last_index
|
180
|
+
last_index += 4
|
181
|
+
@file << chunk[0..last_index]
|
182
|
+
@file << "</#{self.root}>" # TODO set this by using actual depth, so all tags get closed
|
183
|
+
secure_close
|
184
|
+
else
|
185
|
+
@file << chunk
|
186
|
+
end
|
187
|
+
rescue
|
188
|
+
secure_close
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
def file_close
|
193
|
+
if @file
|
194
|
+
@file.close
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
def secure_close
|
199
|
+
begin
|
200
|
+
if @zstream
|
201
|
+
@zstream.close
|
202
|
+
end
|
203
|
+
rescue
|
204
|
+
ensure
|
205
|
+
EM.stop
|
206
|
+
#puts "Closed secure."
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
11
210
|
end
|
12
211
|
end
|
212
|
+
|
213
|
+
#require 'stringio'
|
214
|
+
#str = StringIO.new
|
215
|
+
#z = Zlib::GzipWriter.new(str)
|
216
|
+
#z.write(txt)
|
217
|
+
#z.close
|
data/lib/bliss/parser_machine.rb
CHANGED
@@ -1,161 +1,131 @@
|
|
1
1
|
module Bliss
|
2
|
-
class ParserMachine
|
3
|
-
|
2
|
+
class ParserMachine < Nokogiri::XML::SAX::Document
|
3
|
+
def initialize
|
4
|
+
@depth = []
|
5
|
+
# @settings = {} # downcased
|
4
6
|
|
5
|
-
|
6
|
-
@
|
7
|
-
|
8
|
-
@sax_parser = Bliss::SaxParser.new
|
7
|
+
@root = nil
|
8
|
+
@nodes = {}
|
9
|
+
@current_node = {}
|
9
10
|
|
10
|
-
@
|
11
|
+
@on_root = nil
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
end
|
13
|
+
@on_tag_open = {}
|
14
|
+
@on_tag_close = {}
|
15
15
|
|
16
|
-
@
|
17
|
-
@nodes = nil
|
16
|
+
@closed = false
|
18
17
|
|
19
|
-
on_root {}
|
20
18
|
end
|
21
19
|
|
22
20
|
def on_root(&block)
|
23
|
-
|
24
|
-
@sax_parser.on_root { |root|
|
25
|
-
@root = root
|
26
|
-
block.call(root)
|
27
|
-
}
|
21
|
+
@on_root = block
|
28
22
|
end
|
29
23
|
|
30
|
-
def on_tag_open(element,
|
31
|
-
|
32
|
-
|
33
|
-
overriden_block = Proc.new { |depth|
|
34
|
-
reset_unhandled_bytes
|
35
|
-
block.call(depth)
|
36
|
-
}
|
37
|
-
@sax_parser.on_tag_open(element, overriden_block)
|
24
|
+
def on_tag_open(element, block)
|
25
|
+
@on_tag_open.merge!({element => block})
|
38
26
|
end
|
39
27
|
|
40
|
-
def on_tag_close(element,
|
41
|
-
|
42
|
-
reset_unhandled_bytes
|
43
|
-
block.call(hash)
|
44
|
-
}
|
45
|
-
@sax_parser.on_tag_close(element, overriden_block)
|
28
|
+
def on_tag_close(element, block)
|
29
|
+
@on_tag_close.merge!({element => block})
|
46
30
|
end
|
47
31
|
|
48
|
-
def
|
49
|
-
@
|
32
|
+
def close
|
33
|
+
@closed = true
|
50
34
|
end
|
51
35
|
|
52
|
-
def
|
53
|
-
|
54
|
-
@unhandled_bytes = 0
|
36
|
+
def is_closed?
|
37
|
+
@closed
|
55
38
|
end
|
56
39
|
|
57
|
-
def
|
58
|
-
if
|
59
|
-
|
40
|
+
def start_element(element, attributes)
|
41
|
+
return if is_closed?
|
42
|
+
# element_transformation
|
43
|
+
|
44
|
+
if @root == nil
|
45
|
+
@root = element
|
46
|
+
if @on_root.is_a? Proc
|
47
|
+
@on_root.call(@root)
|
48
|
+
end
|
60
49
|
end
|
61
|
-
end
|
62
50
|
|
63
|
-
|
64
|
-
|
65
|
-
if @
|
66
|
-
|
51
|
+
@depth.push(element) if @depth.last != element
|
52
|
+
|
53
|
+
if @on_tag_open.has_key? element
|
54
|
+
@on_tag_open[element].call(@depth)
|
55
|
+
elsif @on_tag_open.has_key? 'default'
|
56
|
+
@on_tag_open['default'].call(@depth)
|
67
57
|
end
|
68
|
-
end
|
69
58
|
|
70
|
-
|
71
|
-
@max_unhandled_bytes ? true : false
|
72
|
-
end
|
59
|
+
current = @nodes.pair_at_chain(@depth)
|
73
60
|
|
74
|
-
|
75
|
-
|
61
|
+
value_at = @nodes.value_at_chain(@depth)
|
62
|
+
|
63
|
+
if current.is_a? Hash
|
64
|
+
if value_at.is_a? NilClass
|
65
|
+
current[element] = {}
|
66
|
+
elsif value_at.is_a? Hash
|
67
|
+
if current[element].is_a? Array
|
68
|
+
current[element].concat [{}]
|
69
|
+
else
|
70
|
+
current[element] = [current[element], {}]
|
71
|
+
#current = @nodes.pair_at_chain(@depth)
|
72
|
+
end
|
73
|
+
elsif value_at.is_a? Array
|
74
|
+
#puts @depth.inspect
|
75
|
+
#puts current[element].inspect
|
76
|
+
#puts current[element].inspect
|
77
|
+
end
|
78
|
+
elsif current.is_a? Array
|
79
|
+
end
|
80
|
+
|
81
|
+
@current_content = ''
|
76
82
|
end
|
77
83
|
|
78
|
-
def
|
79
|
-
|
84
|
+
def characters(string)
|
85
|
+
return if is_closed?
|
86
|
+
concat_content(string)
|
80
87
|
end
|
81
88
|
|
82
|
-
def
|
83
|
-
|
84
|
-
|
85
|
-
EM.run do
|
86
|
-
http = EM::HttpRequest.new(@path).get
|
87
|
-
http.stream { |chunk|
|
88
|
-
if chunk
|
89
|
-
chunk.force_encoding('UTF-8')
|
90
|
-
|
91
|
-
@parser << chunk
|
92
|
-
|
93
|
-
if check_unhandled_bytes?
|
94
|
-
@unhandled_bytes += chunk.length
|
95
|
-
check_unhandled_bytes
|
96
|
-
end
|
97
|
-
|
98
|
-
if not @sax_parser.is_closed?
|
99
|
-
if @file
|
100
|
-
@file << chunk
|
101
|
-
end
|
102
|
-
else
|
103
|
-
if exceeded?
|
104
|
-
#puts 'exceeded'
|
105
|
-
secure_close
|
106
|
-
else
|
107
|
-
if @file
|
108
|
-
if @wait_tag_close
|
109
|
-
#puts 'handle wait'
|
110
|
-
handle_wait_tag_close(chunk) #if @wait_tag_close
|
111
|
-
else
|
112
|
-
#puts 'secure close'
|
113
|
-
secure_close
|
114
|
-
end
|
115
|
-
end
|
116
|
-
end
|
117
|
-
end
|
118
|
-
end
|
119
|
-
}
|
120
|
-
http.callback {
|
121
|
-
if @file
|
122
|
-
@file.close
|
123
|
-
end
|
124
|
-
EM.stop
|
125
|
-
}
|
126
|
-
end
|
89
|
+
def cdata_block(string)
|
90
|
+
return if is_closed?
|
91
|
+
concat_content(string)
|
127
92
|
end
|
128
|
-
|
129
|
-
def
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
93
|
+
|
94
|
+
def end_element(element, attributes=[])
|
95
|
+
return if is_closed?
|
96
|
+
# element_transformation
|
97
|
+
|
98
|
+
current = @nodes.pair_at_chain(@depth)
|
99
|
+
value_at = @nodes.value_at_chain(@depth)
|
100
|
+
|
101
|
+
if value_at.is_a? Hash
|
102
|
+
current[element] = @current_content if @current_content.size > 0
|
103
|
+
elsif value_at.is_a? NilClass
|
104
|
+
if current.is_a? Array
|
105
|
+
current = current.last
|
106
|
+
current[element] = @current_content if @current_content.size > 0
|
139
107
|
end
|
140
|
-
rescue
|
141
|
-
secure_close
|
142
108
|
end
|
109
|
+
@current_content = ''
|
110
|
+
|
111
|
+
if @on_tag_close.has_key? element
|
112
|
+
@on_tag_close[element].call(value_at, @depth)
|
113
|
+
elsif @on_tag_close.has_key? 'default'
|
114
|
+
@on_tag_close['default'].call(value_at, @depth)
|
115
|
+
end
|
116
|
+
|
117
|
+
@depth.pop if @depth.last == element
|
143
118
|
end
|
144
119
|
|
145
|
-
def
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
ensure
|
150
|
-
EM.stop
|
120
|
+
def concat_content(string)
|
121
|
+
string.strip!
|
122
|
+
if string
|
123
|
+
@current_content << string
|
151
124
|
end
|
152
125
|
end
|
153
126
|
|
127
|
+
def end_document
|
128
|
+
puts @nodes.inspect
|
129
|
+
end
|
154
130
|
end
|
155
131
|
end
|
156
|
-
|
157
|
-
#require 'stringio'
|
158
|
-
#str = StringIO.new
|
159
|
-
#z = Zlib::GzipWriter.new(str)
|
160
|
-
#z.write(txt)
|
161
|
-
#z.close
|