json_extractor 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Brad Heller
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # JSON Extractor
2
+
3
+ Pull out specific objects from a large JSON document by key without having to
4
+ deserialize the whole object into Ruby.
5
+
6
+ **NOTE:** This gem only supports extracting objects from JSON documents.
7
+ Strings, numbers, and arrays are not supported. Yet.
8
+
9
+ ## Example
10
+
11
+ Assume that you have a JSON document:
12
+
13
+ ``` json
14
+ {
15
+ "number": 6,
16
+ "name": "Darlington Nagbe",
17
+ "position": "midfielder",
18
+ "statistics": {
19
+ "2012": {
20
+ "gp": 33,
21
+ "gs": 31,
22
+ "g": 6,
23
+ "min": 2777,
24
+ "a": 1,
25
+ "sht": 50,
26
+ "sog": 16,
27
+ "fc": 16,
28
+ "off": 6,
29
+ "y": 0,
30
+ "r": 0
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ The JSON Extractor API will let you pull out a specific object within that
37
+ file.
38
+
39
+ ``` bash
40
+ $ irb
41
+ 1.9.3p374 :002 > require 'json_extractor'
42
+ => false
43
+ 1.9.3p374 :001 > JSONExtractor.subdocument("/path/to/data.json", "statistics")
44
+ => {"2012"=>{"gp"=>33, "gs"=>31, "g"=>6, "min"=>2777, "a"=>1, "sht"=>50, "sog"=>16, "fc"=>16, "off"=>6, "y"=>0, "r"=>0}}
45
+ ```
46
+
47
+ ## How does it work?
48
+
49
+ The actual implementation is done in C. The whole file is read into memory, the
50
+ key is found somewhere within the file, and the object associated with that key
51
+ is extracted using a simple recursive descent parser that's only aware of the
52
+ semantics of an object.
53
+
54
+ ## Contributing to JSON Extractor
55
+
56
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
57
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
58
+ * Fork the project.
59
+ * Start a feature/bugfix branch.
60
+ * Commit and push until you are happy with your contribution.
61
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
62
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or a change there is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
63
+
@@ -0,0 +1,3 @@
1
# Build configuration for the native extension: mkmf writes a Makefile that
# builds the shared object named json_extractor/json_extractor.
require 'mkmf'

create_makefile('json_extractor/json_extractor')
@@ -0,0 +1,215 @@
1
+ #include <ruby.h>
2
+
3
+ // Needed for isspace.
4
+ #include <ctype.h>
5
+
6
+ #define PARSER_FUNC(fn) unsigned int fn(char *, long, unsigned int)
7
+
8
+ PARSER_FUNC(object);
9
+ PARSER_FUNC(object_body);
10
+ PARSER_FUNC(start);
11
+ PARSER_FUNC(whitespace);
12
+
13
/**
 * Reads the entire file named by filename into memory and returns it as a
 * NUL-terminated buffer.
 *
 * @param [const char*] filename the path of the file to read
 * @return [char *] a heap buffer holding the file's contents followed by a
 *   terminating '\0', or NULL if the file could not be opened, sized,
 *   allocated, or fully read. Must be free'd by caller.
 */
char * read_all(const char * filename) {
  long size;
  size_t actual;
  char *data;
  FILE *fp;

  if(filename == NULL) {
    return NULL;
  }

  fp = fopen(filename, "rb");

  if(!fp) {
    return NULL;
  }

  // Seek to the end to learn how many bytes there are to read.
  if(fseek(fp, 0L, SEEK_END) != 0 || (size = ftell(fp)) < 0) {
    fclose(fp);
    return NULL;
  }

  rewind(fp);

  // One extra byte for the terminator: callers run strstr/strlen over the
  // buffer, so it has to be a valid C string.
  data = malloc((size_t)size + 1);

  if(data == NULL) {
    fclose(fp);
    return NULL;
  }

  actual = fread(data, sizeof(char), (size_t)size, fp);

  // The stream is no longer needed whether or not the read succeeded.
  fclose(fp);

  if(actual != (size_t)size) {
    free(data);
    return NULL;
  }

  data[size] = '\0';

  return data;
}
48
+
49
/**
 * Finds the end of the whitespace that is currently under pos within str.
 *
 * @param [char *] str the string to search.
 * @param [long] len the overall length of the string.
 * @param [unsigned int] pos the initial position (i.e. where to start)
 * @return [unsigned int] the index of the first non-whitespace character at
 *   or after pos, or (unsigned int)-1 if only whitespace remains before len.
 */
unsigned int whitespace(char * str, long len, unsigned int pos) {
  // Check the bound before reading so str[len] is never touched. The cast
  // keeps isspace() well-defined for bytes >= 0x80 when char is signed.
  while(pos < len && isspace((unsigned char)str[pos])) {
    pos++;
  }

  if(pos < len) {
    return pos;
  }

  // Ran off the end without finding anything but whitespace.
  return -1;
}
68
+
69
// object_body and object are mutually recursive; this repeats the prototype
// so the pair is self-contained.
unsigned int object(char * str, long len, unsigned int pos);

/**
 * Finds the end of the current object body that is under pos within str.
 *
 * Scans forward from pos, stepping over nested objects and over string
 * literals (so braces inside strings are not counted), until the '}' that
 * closes the enclosing object is found.
 *
 * @param [char *] str the string to search.
 * @param [long] len the overall length of the string.
 * @param [unsigned int] pos the index just after the object's opening brace.
 * @return [unsigned int] the index of the last character of the body, i.e.
 *   one less than the index of the closing brace (for an empty body this is
 *   the index of the opening brace), or (unsigned int)-1 if the input ends
 *   before the object is closed.
 */
unsigned int object_body(char * str, long len, unsigned int pos) {
  while(pos < len) {
    char c = str[pos];

    if(c == '}') {
      // pos is the closing brace; the body ends one character before it.
      return pos - 1;
    }

    if(c == '"') {
      // Step over the string literal, honouring backslash escapes, so a
      // brace inside a string cannot open or close an object.
      pos++;

      while(pos < len && str[pos] != '"') {
        if(str[pos] == '\\') {
          pos++;
        }

        pos++;
      }

      if(pos >= len) {
        // Unterminated string.
        return -1;
      }
    } else if(c == '{') {
      // Jump to the nested object's closing brace.
      pos = object(str, len, pos);

      if(pos == (unsigned int)-1) {
        return -1;
      }
    }

    pos++;
  }

  // We ran out of input before the object was closed.
  return -1;
}

/**
 * Finds the end of the current object that is under pos within str.
 *
 * @param [char *] str the string to search.
 * @param [long] len the overall length of the string.
 * @param [unsigned int] pos the index of the object's opening brace.
 * @return [unsigned int] the index of the object's closing brace, or
 *   (unsigned int)-1 if str[pos] is not '{' or the object is never closed.
 */
unsigned int object(char * str, long len, unsigned int pos) {
  unsigned int body_end;

  if(pos >= len || str[pos] != '{') {
    // Not positioned on an object.
    return -1;
  }

  body_end = object_body(str, len, pos+1);

  if(body_end == (unsigned int)-1) {
    // Propagate the failure instead of using the sentinel as an index.
    return -1;
  }

  // object_body stops one short of the closing brace.
  return body_end + 1;
}
117
+
118
/**
 * Starts parsing: skips leading whitespace and then expects an object.
 *
 * @param [char *] str the string to search.
 * @param [long] len the overall length of the string.
 * @param [unsigned int] pos the initial position (i.e. where to start)
 * @return [unsigned int] the end of the first-encountered object, or
 *   (unsigned int)-1 if only whitespace remains or the first
 *   non-whitespace character is not '{'.
 */
unsigned int start(char * str, long len, unsigned int pos) {
  pos = whitespace(str, len, pos);

  // whitespace() signals "nothing but whitespace" with -1; that value must
  // not be used to index str.
  if(pos == (unsigned int)-1) {
    return -1;
  }

  if(str[pos] == '{') {
    return object(str, len, pos);
  }

  return -1;
}
135
+
136
/**
 * Locates the end of the object that begins at or after pos within the
 * NUL-terminated string str.
 *
 * @param [char *] str the string to search; may be NULL.
 * @param [unsigned int] pos the initial position (i.e. where to start)
 * @return [int] whatever start() reports for the object's end, or -1 when
 *   str is NULL or pos lies at or beyond the end of the string.
 */
int find_subdocument(char * str, unsigned int pos) {
  size_t length;

  if(!str) {
    return -1;
  }

  length = strlen(str);

  if(length <= pos) {
    return -1;
  }

  return start(str, length, pos);
}
147
+
148
/**
 * Extracts the text of the object stored under key within data.
 *
 * The key is located by searching for the literal text "key": (the quoted
 * key immediately followed by a colon); the first match in data is used.
 *
 * @param [char *] data the NUL-terminated JSON text to search; may be NULL.
 * @param [const char *] key the key whose object should be extracted.
 * @return [char *] a newly allocated, NUL-terminated copy of everything from
 *   just after the colon through the object's closing brace, or NULL if
 *   either argument is NULL, the key is not found, the value is not an
 *   object, or memory could not be allocated. Must be free'd by caller.
 */
char * extract_subdocument(char * data, const char * key) {
  char *full_key, *match, *value, *final;
  size_t full_key_len;
  int end;

  // read_all() yields NULL for unreadable files; don't hand that to strstr.
  if(data == NULL || key == NULL) {
    return NULL;
  }

  // Two quotes and a colon surround the key.
  full_key_len = strlen(key) + 3;
  full_key = malloc(full_key_len + 1);

  if(full_key == NULL) {
    return NULL;
  }

  snprintf(full_key, full_key_len + 1, "\"%s\":", key);
  match = strstr(data, full_key);
  free(full_key);

  if(match == NULL) {
    return NULL;
  }

  // The value starts right after the colon.
  value = match + full_key_len;

  // Let's find the end of the document.
  end = find_subdocument(value, 0);

  if(end < 0) {
    return NULL;
  }

  // end is the index of the closing brace, so the text is end+1 bytes long
  // plus one more for the terminator.
  final = malloc((size_t)end + 2);

  if(final != NULL) {
    memcpy(final, value, (size_t)end + 1);
    final[end + 1] = '\0';
  }

  return final;
}
180
+
181
+ static VALUE rb_extract_subdocument(VALUE self, VALUE str, VALUE key) {
182
+ char *data, *substr;
183
+ VALUE result;
184
+
185
+ // No str?
186
+ if(str == Qnil) {
187
+ return Qnil;
188
+ }
189
+
190
+ // No key?
191
+ if(key == Qnil) {
192
+ return Qnil;
193
+ }
194
+
195
+ data = read_all(RSTRING_PTR(str));
196
+ substr = extract_subdocument(data, RSTRING_PTR(key));
197
+ free(data);
198
+
199
+ if(substr == NULL) {
200
+ return Qnil;
201
+ }
202
+
203
+ result = rb_str_new2(substr);
204
+
205
+ // TODO: Figure out if this is right. It behaves properly, so I'm assuming
206
+ // it's OK.
207
+ free(substr);
208
+
209
+ return result;
210
+ }
211
+
212
+ void Init_json_extractor(void) {
213
+ VALUE rb_mJSONExtractor = rb_define_module("JSONExtractor");
214
+ rb_define_singleton_method(rb_mJSONExtractor, "extract_subdocument", rb_extract_subdocument, 2);
215
+ }
@@ -0,0 +1,9 @@
1
+ require 'json'
2
+ $:.unshift(File.dirname(__FILE__))
3
+ require 'json_extractor/json_extractor'
4
+
5
module JSONExtractor
  # Extracts the object stored under +key+ in the JSON file at +filename+ and
  # parses just that fragment.
  #
  # @param filename [String] path of the JSON document
  # @param key [String] key whose object should be extracted
  # @return [Hash, nil] the parsed object, or nil when extract_subdocument
  #   returns nil instead of a fragment
  def self.subdocument(filename, key)
    raw = extract_subdocument(filename, key)

    # JSON.parse(nil) raises TypeError, so pass the nil through instead.
    return nil if raw.nil?

    JSON.parse(raw)
  end
end
metadata ADDED
@@ -0,0 +1,105 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: json_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brad Heller
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-30 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: jeweler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 1.8.4
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.8.4
46
+ - !ruby/object:Gem::Dependency
47
+ name: rake-compiler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: A set of C extensions that can extract specific keys from a JSON document.
63
+ Right now, only supports extracting objects.
64
+ email: brad@cloudability.com
65
+ executables: []
66
+ extensions:
67
+ - ext/json_extractor/extconf.rb
68
+ extra_rdoc_files:
69
+ - LICENSE.txt
70
+ - README.md
71
+ files:
72
+ - ext/json_extractor/extconf.rb
73
+ - ext/json_extractor/json_extractor.c
74
+ - lib/json_extractor.rb
75
+ - LICENSE.txt
76
+ - README.md
77
+ homepage: http://github.com/bradhe/json_extractor
78
+ licenses:
79
+ - MIT
80
+ post_install_message:
81
+ rdoc_options: []
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ! '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ segments:
91
+ - 0
92
+ hash: -2562154935140798148
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ requirements: []
100
+ rubyforge_project:
101
+ rubygems_version: 1.8.24
102
+ signing_key:
103
+ specification_version: 3
104
+ summary: Tools for extracting JSON without having to deserialize it.
105
+ test_files: []