mortar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. data/README.md +36 -0
  2. data/bin/mortar +13 -0
  3. data/lib/mortar.rb +23 -0
  4. data/lib/mortar/auth.rb +312 -0
  5. data/lib/mortar/cli.rb +54 -0
  6. data/lib/mortar/command.rb +267 -0
  7. data/lib/mortar/command/auth.rb +96 -0
  8. data/lib/mortar/command/base.rb +319 -0
  9. data/lib/mortar/command/clusters.rb +41 -0
  10. data/lib/mortar/command/describe.rb +97 -0
  11. data/lib/mortar/command/generate.rb +121 -0
  12. data/lib/mortar/command/help.rb +166 -0
  13. data/lib/mortar/command/illustrate.rb +97 -0
  14. data/lib/mortar/command/jobs.rb +174 -0
  15. data/lib/mortar/command/pigscripts.rb +45 -0
  16. data/lib/mortar/command/projects.rb +128 -0
  17. data/lib/mortar/command/validate.rb +94 -0
  18. data/lib/mortar/command/version.rb +42 -0
  19. data/lib/mortar/errors.rb +24 -0
  20. data/lib/mortar/generators/generator_base.rb +107 -0
  21. data/lib/mortar/generators/macro_generator.rb +37 -0
  22. data/lib/mortar/generators/pigscript_generator.rb +40 -0
  23. data/lib/mortar/generators/project_generator.rb +67 -0
  24. data/lib/mortar/generators/udf_generator.rb +28 -0
  25. data/lib/mortar/git.rb +233 -0
  26. data/lib/mortar/helpers.rb +488 -0
  27. data/lib/mortar/project.rb +156 -0
  28. data/lib/mortar/snapshot.rb +39 -0
  29. data/lib/mortar/templates/macro/macro.pig +14 -0
  30. data/lib/mortar/templates/pigscript/pigscript.pig +38 -0
  31. data/lib/mortar/templates/pigscript/python_udf.py +13 -0
  32. data/lib/mortar/templates/project/Gemfile +3 -0
  33. data/lib/mortar/templates/project/README.md +8 -0
  34. data/lib/mortar/templates/project/gitignore +4 -0
  35. data/lib/mortar/templates/project/macros/gitkeep +0 -0
  36. data/lib/mortar/templates/project/pigscripts/pigscript.pig +35 -0
  37. data/lib/mortar/templates/project/udfs/python/python_udf.py +13 -0
  38. data/lib/mortar/templates/udf/python_udf.py +13 -0
  39. data/lib/mortar/version.rb +20 -0
  40. data/lib/vendor/mortar/okjson.rb +598 -0
  41. data/lib/vendor/mortar/uuid.rb +312 -0
  42. data/spec/mortar/auth_spec.rb +156 -0
  43. data/spec/mortar/command/auth_spec.rb +46 -0
  44. data/spec/mortar/command/base_spec.rb +82 -0
  45. data/spec/mortar/command/clusters_spec.rb +61 -0
  46. data/spec/mortar/command/describe_spec.rb +135 -0
  47. data/spec/mortar/command/generate_spec.rb +139 -0
  48. data/spec/mortar/command/illustrate_spec.rb +140 -0
  49. data/spec/mortar/command/jobs_spec.rb +364 -0
  50. data/spec/mortar/command/pigscripts_spec.rb +70 -0
  51. data/spec/mortar/command/projects_spec.rb +165 -0
  52. data/spec/mortar/command/validate_spec.rb +119 -0
  53. data/spec/mortar/command_spec.rb +122 -0
  54. data/spec/mortar/git_spec.rb +278 -0
  55. data/spec/mortar/helpers_spec.rb +82 -0
  56. data/spec/mortar/project_spec.rb +76 -0
  57. data/spec/mortar/snapshot_spec.rb +46 -0
  58. data/spec/spec.opts +1 -0
  59. data/spec/spec_helper.rb +278 -0
  60. data/spec/support/display_message_matcher.rb +68 -0
  61. metadata +259 -0
@@ -0,0 +1,156 @@
1
+ #
2
+ # Copyright 2012 Mortar Data Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ require 'fileutils'
18
+
19
module Mortar
  module Project
    # Raised when the on-disk project layout is not usable, e.g. when an
    # expected directory is missing.
    class ProjectError < RuntimeError; end

    # A Mortar project rooted at a directory on disk.
    #
    # Holds the project name, the root path and the remote it is associated
    # with, and gives lazy access to the pigscripts and python UDFs found
    # under the root path.
    class Project
      # Names of the directories every project is expected to contain.
      def self.required_directories
        ["macros", "pigscripts", "udfs"]
      end

      attr_accessor :name
      attr_accessor :remote
      attr_accessor :root_path

      # name      - project name
      # root_path - directory that contains the project
      # remote    - remote associated with the project
      def initialize(name, root_path, remote)
        @name = name
        @root_path = root_path
        @remote = remote
      end

      # Directory that holds the project's python UDFs.
      def python_udfs_path
        File.join(@root_path, "udfs/python")
      end

      # Memoized collection of the *.py files under python_udfs_path.
      # Raises ProjectError if that directory does not exist.
      def python_udfs
        @python_udfs ||= PythonUDFs.new(python_udfs_path, "python", ".py")
      end

      # Directory that holds the project's pigscripts.
      def pigscripts_path
        File.join(@root_path, "pigscripts")
      end

      # Memoized collection of the *.pig files under pigscripts_path.
      # Raises ProjectError if that directory does not exist.
      def pigscripts
        @pigscripts ||= PigScripts.new(pigscripts_path, "pigscripts", ".pig")
      end

      # Returns the project's tmp directory, creating it when missing.
      # mkdir_p is a no-op for an existing directory, so no separate
      # existence check is needed.
      def tmp_path
        path = File.join(@root_path, "tmp")
        FileUtils.mkdir_p(path)
        path
      end
    end

    # Base class for a named collection of files that share an extension
    # and live (at any depth) under one directory.
    #
    # Elements are keyed by file basename without the extension; when two
    # files share a basename, whichever Dir[] returns last wins. Elements can
    # be read with [], enumerated as [name, element] pairs, or fetched by
    # calling the element name as a method.
    class ProjectEntity

      include Enumerable

      # path               - directory to scan
      # name               - label used in error messages
      # filename_extension - extension (including the dot) of the files to collect
      #
      # Raises ProjectError if path is not a directory.
      def initialize(path, name, filename_extension)
        @path = path
        @name = name
        @filename_extension = filename_extension
        @elements = elements
      end

      # Lets callers write entity.some_script instead of entity["some_script"].
      # Unknown names fall through to the normal NoMethodError.
      def method_missing(method, *args, &block)
        method_name = method.to_s
        return @elements[method_name] if @elements && @elements[method_name]
        super
      end

      # Keeps respond_to? in agreement with method_missing.
      def respond_to_missing?(method, include_private = false)
        (!@elements.nil? && @elements.key?(method.to_s)) || super
      end

      # Yields each [name, element] pair. Returns an enumerator when no
      # block is given.
      def each
        return enum_for(:each) unless block_given?
        @elements.each { |key, value| yield [key, value] }
      end

      # Returns the element stored under key, or nil.
      def [](key)
        @elements[key]
      end

      # Returns the names of all elements.
      def keys
        @elements.keys
      end

      protected

      # File basename with the configured extension removed.
      def element_name(element_path)
        File.basename(element_path, @filename_extension)
      end

      # Builds the {element_name => element} hash by globbing @path.
      def elements
        unless File.directory? @path
          raise ProjectError, "Unable to find #{@name} directory in project"
        end

        pattern = File.join(@path, "**", "*#{@filename_extension}")
        found = {}
        Dir[pattern].each do |element_path|
          name = element_name(element_path)
          found[name] = element(name, element_path)
        end
        found
      end

      # Factory hook: subclasses return the object that represents one file.
      # The signature matches how #elements calls it (name, then path).
      def element(name, path)
        raise NotImplementedError, "Implement in subclass"
      end
    end

    # Collection of the pigscripts in a project.
    class PigScripts < ProjectEntity
      def element(name, path)
        Script.new(name, path)
      end
    end

    # Collection of the python UDF files in a project.
    class PythonUDFs < ProjectEntity
      def element(name, path)
        Script.new(name, path)
      end
    end

    # A single script file identified by name and path.
    class Script

      attr_reader :name
      attr_reader :path

      def initialize(name, path)
        @name = name
        @path = path
      end

      # Reads and returns the file contents. The block form guarantees the
      # handle is closed even if the read raises.
      def code
        File.open(@path, "r") { |script_file| script_file.read }
      end

      # The string form of a script is its source code.
      def to_s
        code
      end
    end

  end
end
@@ -0,0 +1,39 @@
1
+ #
2
+ # Copyright 2012 Mortar Data Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
module Mortar
  module Snapshot

    extend self

    # Creates a snapshot branch, pushes it to the project's remote and
    # removes the local branch again.
    #
    # git     - object responding to create_snapshot_branch, push(remote, branch),
    #           git_ref(branch) and branch_delete(branch)
    # project - object responding to remote
    #
    # Returns whatever git.git_ref reports for the snapshot branch.
    #
    # NOTE(review): `action` is not defined in this module; it is presumably
    # supplied by the object this module is mixed into - confirm against callers.
    def create_and_push_snapshot_branch(git, project)
      # create the snapshot branch
      snapshot_branch = action("Taking code snapshot") do
        git.create_snapshot_branch
      end

      # The value of this block is the method's return value.
      action("Sending code snapshot to Mortar") do
        # push the code
        git.push(project.remote, snapshot_branch)

        # grab the commit hash and clean out the branch from the local branches
        ref = git.git_ref(snapshot_branch)
        git.branch_delete(snapshot_branch)
        ref
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ /**
2
+ * <%= macro_name %>: Pig macros for use in pigscripts.
3
+ *
4
+ */
5
+
6
+ /**
7
+ * A simple example macro function that returns the entity passed in.
8
+ */
9
+ DEFINE <%= macro_name.capitalize %>_EXAMPLE(input_relation)
10
+ returns output_relation {
11
+ -- just an example
12
+ $output_relation = FOREACH $input_relation
13
+ GENERATE *;
14
+ };
@@ -0,0 +1,38 @@
1
/**
 * <%= script_name %>
 *
 * Required parameters:
 *
 * -param INPUT_PATH Input path for script data (e.g. s3n://hawk-example-data/tutorial/excite.log.bz2)
 * -param OUTPUT_PATH Output path for script data (e.g. s3n://my-output-bucket/<%= script_name %>)
 */

<% if not options[:skip_udf] %>
/**
 * User-Defined Functions (UDFs)
 */

REGISTER '../udfs/python/<%= script_name %>.py' using streaming_python as <%= script_name %>;
<% end %>

-- This is an example of loading up input data
my_input_data = LOAD '$INPUT_PATH'
    USING PigStorage('\t')
    AS (field0:chararray, field1:chararray, field2:chararray);

-- This is an example pig operation
filtered = FILTER my_input_data
    BY field0 IS NOT NULL;

<% if not options[:skip_udf] %>
-- This is an example call to a python user-defined function
output_data = FOREACH filtered
    GENERATE field0..field2,
             <%= script_name %>.example_udf(field0) AS example_udf_field;
<% else %>
-- No UDF is registered for this script, so only the input fields are projected
output_data = FOREACH filtered
    GENERATE field0..field2;
<% end %>

-- remove any existing data
rmf $OUTPUT_PATH;

-- store the results
STORE output_data
    INTO '$OUTPUT_PATH'
    USING PigStorage('\t');
@@ -0,0 +1,13 @@
1
+ from pig_util import outputSchema
2
+
3
+ #
4
+ # This is where we write python UDFs (User-Defined Functions) that we can call from pig.
5
+ # Pig needs to know the schema of the data coming out of the function,
6
+ # which we specify using the @outputSchema decorator.
7
+ #
8
@outputSchema('example_udf:int')
def example_udf(input_str):
    """
    Example UDF: returns the length of input_str, or None when
    input_str is empty or None.
    """
    if not input_str:
        return None
    return len(input_str)
@@ -0,0 +1,3 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

gem "mortar"
@@ -0,0 +1,8 @@
1
+ # Welcome to Mortar!
2
+
3
+ Copy TK
4
+
5
+
6
+ # Getting Started
7
+
8
+ Copy TK
@@ -0,0 +1,4 @@
1
+ *.pyc
2
+ *.class
3
+ *.log
4
+ tmp
File without changes
@@ -0,0 +1,35 @@
1
+ /**
2
+ * <%= project_name %>
3
+ *
4
+ * Required parameters:
5
+ *
6
+ * -param INPUT_PATH Input path for script data (e.g. s3n://hawk-example-data/tutorial/excite.log.bz2)
7
+ * -param OUTPUT_PATH Output path for script data (e.g. s3n://my-output-bucket/<%= project_name %>)
8
+ */
9
+
10
+ /**
11
+ * User-Defined Functions (UDFs)
12
+ */
13
+ REGISTER '../udfs/python/<%= project_name %>.py' using streaming_python as <%= project_name %>;
14
+
15
+ -- This is an example of loading up input data
16
+ my_input_data = LOAD '$INPUT_PATH'
17
+ USING PigStorage('\t')
18
+ AS (field0:chararray, field1:chararray, field2:chararray);
19
+
20
+ -- This is an example pig operation
21
+ filtered = FILTER my_input_data
22
+ BY field0 IS NOT NULL;
23
+
24
+ -- This is an example call to a python user-defined function
25
+ with_udf_output = FOREACH filtered
26
+ GENERATE field0..field2,
27
+ <%= project_name %>.example_udf(field0) AS example_udf_field;
28
+
29
+ -- remove any existing data
30
+ rmf $OUTPUT_PATH;
31
+
32
+ -- store the results
33
+ STORE with_udf_output
34
+ INTO '$OUTPUT_PATH'
35
+ USING PigStorage('\t');
@@ -0,0 +1,13 @@
1
+ from pig_util import outputSchema
2
+
3
+ #
4
+ # This is where we write python UDFs (User-Defined Functions) that we can call from pig.
5
+ # Pig needs to know the schema of the data coming out of the function,
6
+ # which we specify using the @outputSchema decorator.
7
+ #
8
@outputSchema('example_udf:int')
def example_udf(input_str):
    """
    Example UDF: returns the length of input_str, or None when
    input_str is empty or None.
    """
    if not input_str:
        return None
    return len(input_str)
@@ -0,0 +1,13 @@
1
+ from pig_util import outputSchema
2
+
3
+ #
4
+ # This is where we write python UDFs (User-Defined Functions) that we can call from pig.
5
+ # Pig needs to know the schema of the data coming out of the function,
6
+ # which we specify using the @outputSchema decorator.
7
+ #
8
@outputSchema('example_udf:int')
def example_udf(input_str):
    """
    Example UDF: returns the length of input_str, or None when
    input_str is empty or None.
    """
    if not input_str:
        return None
    return len(input_str)
@@ -0,0 +1,20 @@
1
+ #
2
+ # Copyright 2012 Mortar Data Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
module Mortar
  # Gem version; see http://semver.org/
  # Frozen so the shared constant cannot be mutated in place by callers.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,598 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # Copyright 2011, 2012 Keith Rarick
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ # See https://github.com/kr/okjson for updates.
24
+
25
+ require 'stringio'
26
+
27
+ # Some parts adapted from
28
+ # http://golang.org/src/pkg/json/decode.go and
29
+ # http://golang.org/src/pkg/utf8/utf8.go
30
module Mortar
  # Small pure-Ruby JSON encoder/decoder (vendored okjson).
  #
  # Public entry points are OkJson.decode and OkJson.encode; the remaining
  # methods are lexer/parser/encoder helpers that are reachable because the
  # module extends itself.
  module OkJson
    extend self


    # Decodes a json document in string s and
    # returns the corresponding ruby value.
    # String s must be valid UTF-8. If you have
    # a string in some other encoding, convert
    # it first.
    #
    # String values in the resulting structure
    # will be UTF-8.
    #
    # Raises Error for any malformed or truncated document.
    def decode(s)
      ts = lex(s)
      v, ts = textparse(ts)
      if ts.length > 0
        raise Error, 'trailing garbage'
      end
      v
    end


    # Parses a "json text" in the sense of RFC 4627.
    # Returns the parsed value and any trailing tokens.
    # Note: this is almost the same as valparse,
    # except that it does not accept atomic values.
    def textparse(ts)
      raise Error, 'empty' if ts.empty?

      typ, _, val = ts[0]
      case typ
      when '{' then objparse(ts)
      when '[' then arrparse(ts)
      else
        raise Error, "unexpected #{val.inspect}"
      end
    end


    # Parses a "value" in the sense of RFC 4627.
    # Returns the parsed value and any trailing tokens.
    def valparse(ts)
      raise Error, 'empty' if ts.empty?

      typ, _, val = ts[0]
      case typ
      when '{' then objparse(ts)
      when '[' then arrparse(ts)
      when :val,:str then [val, ts[1..-1]]
      else
        raise Error, "unexpected #{val.inspect}"
      end
    end


    # Parses an "object" in the sense of RFC 4627.
    # Returns the parsed value and any trailing tokens.
    # Raises Error (rather than crashing on nil) when the token
    # list ends before the closing brace.
    def objparse(ts)
      ts = eat('{', ts)
      obj = {}

      return obj, ts[1..-1] if peektyp(ts) == '}'

      loop do
        k, v, ts = pairparse(ts)
        obj[k] = v

        return obj, ts[1..-1] if peektyp(ts) == '}'

        ts = eat(',', ts)
      end
    end


    # Parses a "member" in the sense of RFC 4627.
    # Returns the parsed values and any trailing tokens.
    def pairparse(ts)
      raise Error, 'unexpected end of input' if ts.empty?

      (typ, _, k), ts = ts[0], ts[1..-1]
      if typ != :str
        raise Error, "unexpected #{k.inspect}"
      end
      ts = eat(':', ts)
      v, ts = valparse(ts)
      [k, v, ts]
    end


    # Parses an "array" in the sense of RFC 4627.
    # Returns the parsed value and any trailing tokens.
    # Raises Error when the token list ends before the closing bracket.
    def arrparse(ts)
      ts = eat('[', ts)
      arr = []

      return arr, ts[1..-1] if peektyp(ts) == ']'

      loop do
        v, ts = valparse(ts)
        arr << v

        return arr, ts[1..-1] if peektyp(ts) == ']'

        ts = eat(',', ts)
      end
    end


    # Consumes one token of type typ from the front of ts and returns
    # the remaining tokens. Raises Error if the next token has a
    # different type or if there is no next token.
    def eat(typ, ts)
      if peektyp(ts) != typ
        raise Error, "expected #{typ} (got #{ts[0].inspect})"
      end
      ts[1..-1]
    end


    # Returns the type of the first token in ts, or nil when ts is empty.
    def peektyp(ts)
      ts.empty? ? nil : ts[0][0]
    end


    # Scans s and returns a list of json tokens,
    # excluding white space (as defined in RFC 4627).
    def lex(s)
      ts = []
      while s.length > 0
        typ, lexeme, val = tok(s)
        if typ == nil
          raise Error, "invalid character at #{s[0,10].inspect}"
        end
        if typ != :space
          ts << [typ, lexeme, val]
        end
        s = s[lexeme.length..-1]
      end
      ts
    end


    # Scans the first token in s and
    # returns a 3-element list, or an empty list
    # if s does not begin with a valid token.
    #
    # The first list element is one of
    # '{', '}', ':', ',', '[', ']',
    # :val, :str, and :space.
    #
    # The second element is the lexeme.
    #
    # The third element is the value of the
    # token for :val and :str, otherwise
    # it is the lexeme.
    def tok(s)
      case s[0]
      when ?{  then ['{', s[0,1], s[0,1]]
      when ?}  then ['}', s[0,1], s[0,1]]
      when ?:  then [':', s[0,1], s[0,1]]
      when ?,  then [',', s[0,1], s[0,1]]
      when ?[  then ['[', s[0,1], s[0,1]]
      when ?]  then [']', s[0,1], s[0,1]]
      when ?n  then nulltok(s)
      when ?t  then truetok(s)
      when ?f  then falsetok(s)
      when ?"  then strtok(s)
      when Spc then [:space, s[0,1], s[0,1]]
      when ?\t then [:space, s[0,1], s[0,1]]
      when ?\n then [:space, s[0,1], s[0,1]]
      when ?\r then [:space, s[0,1], s[0,1]]
      else          numtok(s)
      end
    end


    # Literal tokens. Each returns [] when s does not start with the literal.
    def nulltok(s);  s[0,4] == 'null'  ? [:val, 'null',  nil]   : [] end
    def truetok(s);  s[0,4] == 'true'  ? [:val, 'true',  true]  : [] end
    def falsetok(s); s[0,5] == 'false' ? [:val, 'false', false] : [] end


    # Scans a number at the start of s. Returns [] if there is none.
    #
    # m[1] is the signed integer part, m[2] the fraction, m[3] the exponent.
    # Numbers with a fraction, or with a negative exponent, become Floats;
    # an integer with a non-negative exponent stays an exact Integer.
    def numtok(s)
      m = /\A(-?(?:[1-9][0-9]+|[0-9]))([.][0-9]+)?([eE][+-]?[0-9]+)?/.match(s)
      return [] unless m

      if m[2]
        [:val, m[0], Float(m[0])]
      elsif m[3]
        # Base 10 is explicit so an exponent such as "08" is not read as octal.
        exponent = Integer(m[3][1..-1], 10)
        if exponent < 0
          [:val, m[0], Float(m[0])]
        else
          [:val, m[0], Integer(m[1], 10) * (10 ** exponent)]
        end
      else
        [:val, m[0], Integer(m[0], 10)]
      end
    end


    # Scans a string literal at the start of s.
    # The pattern is anchored so a later, unrelated literal cannot be
    # mistaken for the token at the current position.
    def strtok(s)
      m = /\A"([^"\\]|\\["\/\\bfnrt]|\\u[0-9a-fA-F]{4})*"/.match(s)
      if ! m
        raise Error, "invalid string literal at #{abbrev(s)}"
      end
      [:str, m[0], unquote(m[0])]
    end


    # Returns at most the first 10 characters of s wrapped in backticks,
    # cut short at any backtick in s and suffixed with '...' when shortened.
    def abbrev(s)
      t = s[0,10]
      p = t.index('`')
      t = t[0,p] if p
      t = t + '...' if t.length < s.length
      '`' + t + '`'
    end


    # Converts a quoted json string literal q into a UTF-8-encoded string.
    # The rules are different than for Ruby, so we cannot use eval.
    # Unquote will raise an error if q contains control characters.
    # An unpaired surrogate escape decodes to U+FFFD.
    def unquote(q)
      q = q[1...-1]
      a = q.dup # allocate a big enough string
      rubydoesenc = false
      # In ruby >= 1.9, a[w] is a codepoint, not a byte.
      if a.class.method_defined?(:force_encoding)
        a.force_encoding('UTF-8')
        rubydoesenc = true
      end
      r, w = 0, 0
      while r < q.length
        c = q[r]
        if c == ?\\
          r += 1
          if r >= q.length
            raise Error, "string literal ends with a \"\\\": \"#{q}\""
          end

          case q[r]
          when ?", ?\\, ?/, ?'
            a[w] = q[r]
            r += 1
            w += 1
          when ?b, ?f, ?n, ?r, ?t
            a[w] = Unesc[q[r]]
            r += 1
            w += 1
          when ?u
            r += 1
            uchar = begin
              hexdec4(q[r,4])
            rescue Error => e
              raise Error, "invalid escape sequence \\u#{q[r,4]}: #{e}"
            end
            r += 4
            if surrogate? uchar
              paired = Ucharerr
              if q.length >= r+6 && q[r,2] == '\\u'
                paired = subst(uchar, hexdec4(q[r+2,4]))
              end
              # A valid pair; consume the second escape too.
              r += 6 if paired != Ucharerr
              uchar = paired
            end
            if rubydoesenc
              a[w] = [uchar].pack('U')
              w += 1
            else
              w += ucharenc(a, w, uchar)
            end
          else
            raise Error, "invalid escape char #{q[r]} in \"#{q}\""
          end
        elsif c == ?" || c < Spc
          raise Error, "invalid character in string literal \"#{q}\""
        else
          # Copy anything else byte-for-byte.
          # Valid UTF-8 will remain valid UTF-8.
          # Invalid UTF-8 will remain invalid UTF-8.
          # In ruby >= 1.9, c is a codepoint, not a byte,
          # in which case this is still what we want.
          a[w] = c
          r += 1
          w += 1
        end
      end
      a[0,w]
    end


    # Encodes unicode character u as UTF-8
    # bytes in string a at position i.
    # Returns the number of bytes written.
    def ucharenc(a, i, u)
      case true
      when u <= Uchar1max
        a[i] = (u & 0xff).chr
        1
      when u <= Uchar2max
        a[i+0] = (Utag2 | ((u>>6)&0xff)).chr
        a[i+1] = (Utagx | (u&Umaskx)).chr
        2
      when u <= Uchar3max
        a[i+0] = (Utag3 | ((u>>12)&0xff)).chr
        a[i+1] = (Utagx | ((u>>6)&Umaskx)).chr
        a[i+2] = (Utagx | (u&Umaskx)).chr
        3
      else
        a[i+0] = (Utag4 | ((u>>18)&0xff)).chr
        a[i+1] = (Utagx | ((u>>12)&Umaskx)).chr
        a[i+2] = (Utagx | ((u>>6)&Umaskx)).chr
        a[i+3] = (Utagx | (u&Umaskx)).chr
        4
      end
    end


    # Decodes exactly four hex digits into an Integer.
    # Raises Error if s is not four characters long or has a non-hex digit.
    def hexdec4(s)
      if s.length != 4
        raise Error, 'short'
      end
      (nibble(s[0])<<12) | (nibble(s[1])<<8) | (nibble(s[2])<<4) | nibble(s[3])
    end


    # Combines a high surrogate u1 and a low surrogate u2 into the code
    # point they encode. Returns Ucharerr if they do not form a valid pair.
    def subst(u1, u2)
      if Usurr1 <= u1 && u1 < Usurr2 && Usurr2 <= u2 && u2 < Usurr3
        # The offset is added after the two halves are combined; '+' binds
        # tighter than '|', so the grouping has to be explicit.
        return (((u1-Usurr1)<<10) | (u2-Usurr2)) + Usurrself
      end
      return Ucharerr
    end


    # True if u lies in the surrogate range.
    def surrogate?(u)
      Usurr1 <= u && u < Usurr3
    end


    # Returns the numeric value of a single hex digit c.
    # Raises Error for anything outside 0-9, a-f and A-F.
    def nibble(c)
      case true
      when ?0 <= c && c <= ?9 then c.ord - ?0.ord
      when ?a <= c && c <= ?f then c.ord - ?a.ord + 10
      when ?A <= c && c <= ?F then c.ord - ?A.ord + 10
      else
        raise Error, "invalid hex code #{c}"
      end
    end


    # Encodes x into a json text. It may contain only
    # Array, Hash, String, Numeric, true, false, nil.
    # (Note, this list excludes Symbol.)
    # X itself must be an Array or a Hash.
    # No other value can be encoded, and an error will
    # be raised if x contains any other value, such as
    # Nan, Infinity, Symbol, and Proc, or if a Hash key
    # is not a String.
    # Strings contained in x must be valid UTF-8.
    def encode(x)
      case x
      when Hash    then objenc(x)
      when Array   then arrenc(x)
      else
        raise Error, 'root value must be an Array or a Hash'
      end
    end


    # Encodes any supported value; raises Error for unsupported ones.
    def valenc(x)
      case x
      when Hash    then objenc(x)
      when Array   then arrenc(x)
      when String  then strenc(x)
      when Numeric then numenc(x)
      when true    then "true"
      when false   then "false"
      when nil     then "null"
      else
        raise Error, "cannot encode #{x.class}: #{x.inspect}"
      end
    end


    # Encodes a Hash as a json object.
    def objenc(x)
      '{' + x.map{|k,v| keyenc(k) + ':' + valenc(v)}.join(',') + '}'
    end


    # Encodes an Array as a json array.
    def arrenc(a)
      '[' + a.map{|x| valenc(x)}.join(',') + ']'
    end


    # Encodes a Hash key, which must be a String.
    def keyenc(k)
      case k
      when String then strenc(k)
      else
        raise Error, "Hash key is not a string: #{k.inspect}"
      end
    end


    # Encodes string s as a quoted json string literal.
    # Control characters are written as \uXXXX escapes and invalid
    # UTF-8 is replaced with U+FFFD.
    def strenc(s)
      # Write into an explicitly UTF-8 buffer so the result does not
      # inherit the process's default external encoding.
      buf = String.new
      buf.force_encoding('UTF-8') if buf.respond_to?(:force_encoding)
      t = StringIO.new(buf)
      t.putc(?")
      r = 0

      # In ruby >= 1.9, s[r] is a codepoint, not a byte.
      rubydoesenc = s.class.method_defined?(:encoding)

      while r < s.length
        case s[r]
        when ?"  then t.print('\\"')
        when ?\\ then t.print('\\\\')
        when ?\b then t.print('\\b')
        when ?\f then t.print('\\f')
        when ?\n then t.print('\\n')
        when ?\r then t.print('\\r')
        when ?\t then t.print('\\t')
        else
          c = s[r]
          if rubydoesenc
            begin
              code = c.ord # will raise an error if c is invalid UTF-8
              if code < 0x20
                # remaining control characters must be escaped in json
                t.write("\\u%04x" % code)
              else
                t.write(c)
              end
            rescue
              t.write(Ustrerr)
            end
          elsif c < Spc
            t.print("\\u%04x" % c)
          elsif c <= ?~
            t.putc(c)
          else
            n = ucharcopy(t, s, r) # ensure valid UTF-8 output
            r += n - 1 # r is incremented below
          end
        end
        r += 1
      end
      t.putc(?")
      t.string
    end


    # Encodes a Numeric. Raises Error for NaN and the infinities,
    # which json cannot represent.
    def numenc(x)
      if x.respond_to?(:nan?) && (x.nan? || x.infinite?)
        raise Error, "Numeric cannot be represented: #{x}"
      end
      "#{x}"
    end


    # Copies the valid UTF-8 bytes of a single character
    # from string s at position i to I/O object t, and
    # returns the number of bytes copied.
    # If no valid UTF-8 char exists at position i,
    # ucharcopy writes Ustrerr and returns 1.
    def ucharcopy(t, s, i)
      n = s.length - i
      raise Utf8Error if n < 1

      c0 = s[i].ord

      # 1-byte, 7-bit sequence?
      if c0 < Utagx
        t.putc(c0)
        return 1
      end

      raise Utf8Error if c0 < Utag2 # unexpected continuation byte?

      raise Utf8Error if n < 2 # need continuation byte
      c1 = s[i+1].ord
      raise Utf8Error if c1 < Utagx || Utag2 <= c1

      # 2-byte, 11-bit sequence?
      if c0 < Utag3
        raise Utf8Error if ((c0&Umask2)<<6 | (c1&Umaskx)) <= Uchar1max
        t.putc(c0)
        t.putc(c1)
        return 2
      end

      # need second continuation byte
      raise Utf8Error if n < 3

      c2 = s[i+2].ord
      raise Utf8Error if c2 < Utagx || Utag2 <= c2

      # 3-byte, 16-bit sequence?
      if c0 < Utag4
        u = (c0&Umask3)<<12 | (c1&Umaskx)<<6 | (c2&Umaskx)
        raise Utf8Error if u <= Uchar2max
        t.putc(c0)
        t.putc(c1)
        t.putc(c2)
        return 3
      end

      # need third continuation byte
      raise Utf8Error if n < 4
      c3 = s[i+3].ord
      raise Utf8Error if c3 < Utagx || Utag2 <= c3

      # 4-byte, 21-bit sequence?
      if c0 < Utag5
        u = (c0&Umask4)<<18 | (c1&Umaskx)<<12 | (c2&Umaskx)<<6 | (c3&Umaskx)
        raise Utf8Error if u <= Uchar3max
        t.putc(c0)
        t.putc(c1)
        t.putc(c2)
        t.putc(c3)
        return 4
      end

      raise Utf8Error
    rescue Utf8Error
      t.write(Ustrerr)
      return 1
    end


    # Internal signal used by ucharcopy for malformed UTF-8.
    class Utf8Error < ::StandardError
    end


    # Raised for every decoding and encoding failure.
    class Error < ::StandardError
    end


    Utagx = 0x80 # 1000 0000
    Utag2 = 0xc0 # 1100 0000
    Utag3 = 0xe0 # 1110 0000
    Utag4 = 0xf0 # 1111 0000
    Utag5 = 0xF8 # 1111 1000
    Umaskx = 0x3f # 0011 1111
    Umask2 = 0x1f # 0001 1111
    Umask3 = 0x0f # 0000 1111
    Umask4 = 0x07 # 0000 0111
    Uchar1max = (1<<7) - 1
    Uchar2max = (1<<11) - 1
    Uchar3max = (1<<16) - 1
    Ucharerr = 0xFFFD # unicode "replacement char"
    Ustrerr = "\xef\xbf\xbd" # unicode "replacement char"
    Usurrself = 0x10000
    Usurr1 = 0xd800
    Usurr2 = 0xdc00
    Usurr3 = 0xe000

    Spc = ' '[0]
    Unesc = {?b=>?\b, ?f=>?\f, ?n=>?\n, ?r=>?\r, ?t=>?\t}
  end
end