mmapscanner 0.1a

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ MmapScanner
2
+ ===========
3
+
4
+ Description
5
+ -----------
6
+
7
+ 文字列の代わりにファイルを mmap(2) した領域に対して StringScanner のようなことをするものです。
8
+
9
+ Installation
10
+ ------------
11
+
12
+ $ cd ext
13
+ $ ruby ./extconf.rb
14
+ $ make
15
+ $ sudo make install
16
+
17
+ Gem Installation
18
+ ----------------
19
+
20
+ $ gem install mmapscanner
21
+
22
+ Features
23
+ --------
24
+
25
+ * ファイルを mmap(2) した領域から正規表現に適合した部分データを返します。
26
+ * 返されるデータも MmapScanner オブジェクトで、最初にファイルから mmap(2) した領域を共有しています。
27
+ * mmap(2) を使用しているので大量データでもメモリを消費しません。to_s することではじめて String オブジェクトを生成します。
28
+
29
+ Usage
30
+ -----
31
+
32
+ * MmapScanner.new でファイルを mmap(2) します。mmap(2) できないファイルやパラメータを渡すとエラーになります。
33
+
34
+ # ファイル全体を mmap
35
+ ms = MmapScanner.new(File.open("filename"))
36
+ # ファイルの先頭 4096 バイト以降を mmap
37
+ ms = MmapScanner.new(File.open("filename"), 4096)
38
+ # ファイルの先頭 4096 バイト以降の 1234 バイト分を mmap
39
+ ms = MmapScanner.new(File.open("filename"), 4096, 1234)
40
+
41
+ * size, length は mmap(2) したサイズを返します。
42
+ * to_s は mmap(2) した領域を String で返します。Encoding は常に ASCII-8BIT です。
43
+ * slice は mmap(2) した領域の一部を新たな MmapScanner オブジェクトで返します。
44
+ * scan は正規表現に一致した部分を返し、ポインタを進めます。一致しない場合は nil を返します。
45
+ * check は scan と同じですが、ポインタを進めません。
46
+ * skip は scan と同じですが、一致したバイト数を返します。
47
+ * match? は check と同じですが、一致したバイト数を返します。
48
+ * peek は指定したバイト数分のデータを返します。ポインタは進みません。
49
+ * eos? はポインタが末尾に達していると true を返します。
50
+ * rest はポインタ以降のデータを返します。
51
+
52
+ Copyright
53
+ ---------
54
+
55
+ <dl>
56
+ <dt>Author<dd>TOMITA Masahiro <tommy@tmtm.org>
57
+ <dt>Copyright<dd>Copyright (c) 2011 TOMITA Masahiro
58
+ <dt>License<dd>Ruby's
59
+ </dl>
60
+
data/ext/extconf.rb ADDED
@@ -0,0 +1,2 @@
1
# Build script for the C extension: loads mkmf and emits a Makefile
# that builds the "mmapscanner" extension.
require "mkmf"

create_makefile("mmapscanner")
data/ext/mmapscanner.c ADDED
@@ -0,0 +1,254 @@
1
+ #include <sys/types.h>
2
+ #include <sys/stat.h>
3
+ #include <unistd.h>
4
+ #include <sys/mman.h>
5
+ #include <ruby.h>
6
+ #include <ruby/io.h>
7
+
8
+ static VALUE cMmapScanner;
9
+
10
+ typedef struct {
11
+ char *ptr;
12
+ size_t size;
13
+ size_t pos;
14
+ } mmap_data_t;
15
+
16
+ static void mmap_free(mmap_data_t *data)
17
+ {
18
+ if (data->ptr)
19
+ munmap(data->ptr, data->size);
20
+ free(data);
21
+ }
22
+
23
+ static VALUE allocate(VALUE klass)
24
+ {
25
+ VALUE obj;
26
+ mmap_data_t *data;
27
+
28
+ data = xmalloc(sizeof(mmap_data_t));
29
+ data->ptr = NULL;
30
+ data->size = 0;
31
+ data->pos = 0;
32
+ obj = Data_Wrap_Struct(klass, 0, mmap_free, data);
33
+ rb_iv_set(obj, "parent", Qnil);
34
+ return obj;
35
+ }
36
+
37
+ static VALUE initialize(int argc, VALUE *argv, VALUE obj)
38
+ {
39
+ VALUE src, size, pos, p, pp;
40
+ int fd;
41
+ void *ptr;
42
+ mmap_data_t *data, *parent;
43
+ struct stat st;
44
+ size_t sz, offset;
45
+
46
+ Data_Get_Struct(obj, mmap_data_t, data);
47
+ if (data->ptr)
48
+ rb_raise(rb_eRuntimeError, "already initialized");
49
+ rb_scan_args(argc, argv, "12", &src, &pos, &size);
50
+ if (pos != Qnil && NUM2LL(pos) < 0)
51
+ rb_raise(rb_eRangeError, "position out of range: %lld", NUM2LL(pos));
52
+ if (size != Qnil && NUM2LL(size) < 0)
53
+ rb_raise(rb_eRangeError, "length out of range: %lld", NUM2LL(size));
54
+ offset = pos == Qnil ? 0 : NUM2SIZET(pos);
55
+ if (rb_obj_class(src) == cMmapScanner) {
56
+ Data_Get_Struct(src, mmap_data_t, parent);
57
+ if (offset >= parent->size)
58
+ rb_raise(rb_eRangeError, "length out of range: %zu >= %zu", offset, parent->size);
59
+ sz = size == Qnil ? parent->size - offset : NUM2SIZET(size);
60
+ ptr = parent->ptr + offset;
61
+ if (sz > parent->size - offset)
62
+ sz = parent->size-offset;
63
+ data->ptr = ptr;
64
+ data->size = sz;
65
+ p = src;
66
+ while ((pp = rb_iv_get(p, "parent")) != Qnil)
67
+ p = pp;
68
+ rb_iv_set(obj, "parent", p);
69
+ return;
70
+ }
71
+ Check_Type(src, T_FILE);
72
+ fd = RFILE(src)->fptr->fd;
73
+ fstat(fd, &st);
74
+ sz = size == Qnil ? st.st_size - offset : NUM2SIZET(size);
75
+ if (sz > st.st_size - offset)
76
+ sz = st.st_size - offset;
77
+ if ((ptr = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, offset)) == MAP_FAILED) {
78
+ rb_exc_raise(rb_funcall(rb_eSystemCallError, rb_intern("new"), 1, INT2FIX(errno)));
79
+ }
80
+ data->ptr = ptr;
81
+ data->size = sz;
82
+ }
83
+
84
+ static VALUE size(VALUE obj)
85
+ {
86
+ mmap_data_t *data;
87
+
88
+ Data_Get_Struct(obj, mmap_data_t, data);
89
+ return ULL2NUM(data->size);
90
+ }
91
+
92
+ static VALUE to_s(VALUE obj)
93
+ {
94
+ mmap_data_t *data;
95
+
96
+ Data_Get_Struct(obj, mmap_data_t, data);
97
+ return rb_str_new(data->ptr, data->size);
98
+ }
99
+
100
+ static VALUE slice(VALUE obj, VALUE pos, VALUE len)
101
+ {
102
+ size_t offset;
103
+ size_t length;
104
+ mmap_data_t *data;
105
+
106
+ Data_Get_Struct(obj, mmap_data_t, data);
107
+ return rb_funcall(cMmapScanner, rb_intern("new"), 3, obj, pos, len);
108
+ }
109
+
110
+ static VALUE inspect(VALUE obj)
111
+ {
112
+ rb_str_new2("#<MmapScanner>");
113
+ }
114
+
115
+ static VALUE pos(VALUE obj)
116
+ {
117
+ mmap_data_t *data;
118
+
119
+ Data_Get_Struct(obj, mmap_data_t, data);
120
+ return ULL2NUM(data->pos);
121
+ }
122
+
123
+ static VALUE set_pos(VALUE obj, VALUE pos)
124
+ {
125
+ mmap_data_t *data;
126
+ size_t p;
127
+
128
+ if (NUM2LL(pos) < 0)
129
+ rb_raise(rb_eRangeError, "out of range: %lld", NUM2LL(pos));
130
+ Data_Get_Struct(obj, mmap_data_t, data);
131
+ p = NUM2SIZET(pos);
132
+ if (p > data->size)
133
+ rb_raise(rb_eRangeError, "out of range: %zu > %zu", p, data->size);
134
+ data->pos = p;
135
+ return pos;
136
+ }
137
+
138
+ static VALUE scan_sub(VALUE obj, VALUE re, int forward)
139
+ {
140
+ regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
141
+ mmap_data_t *data;
142
+ regex_t *reg;
143
+ int tmpreg;
144
+ int result;
145
+ struct re_registers regs;
146
+ size_t old_pos, matched_len;
147
+
148
+ Check_Type(re, T_REGEXP);
149
+ Data_Get_Struct(obj, mmap_data_t, data);
150
+ if (data->pos >= data->size)
151
+ return Qnil;
152
+
153
+ reg = rb_reg_prepare_re(re, rb_str_new("", 0));
154
+ tmpreg = reg != RREGEXP(re)->ptr;
155
+ if (!tmpreg) RREGEXP(re)->usecnt++;
156
+
157
+ onig_region_init(&regs);
158
+ result = onig_match(reg, (UChar* )(data->ptr+data->pos),
159
+ (UChar* )(data->ptr+data->size),
160
+ (UChar* )(data->ptr+data->pos),
161
+ &regs, ONIG_OPTION_NONE);
162
+ if (!tmpreg) RREGEXP(re)->usecnt--;
163
+ if (tmpreg) {
164
+ if (RREGEXP(re)->usecnt) {
165
+ onig_free(reg);
166
+ } else {
167
+ onig_free(RREGEXP(re)->ptr);
168
+ RREGEXP(re)->ptr = reg;
169
+ }
170
+ }
171
+ if (result < 0)
172
+ return Qnil;
173
+ old_pos = data->pos;
174
+ matched_len = regs.end[0];
175
+ if (forward)
176
+ data->pos += matched_len;
177
+ return rb_funcall(cMmapScanner, rb_intern("new"), 3, obj, ULL2NUM(old_pos), ULL2NUM(matched_len));
178
+ }
179
+
180
+ static VALUE scan(VALUE obj, VALUE re)
181
+ {
182
+ return scan_sub(obj, re, 1);
183
+ }
184
+
185
+ static VALUE check(VALUE obj, VALUE re)
186
+ {
187
+ return scan_sub(obj, re, 0);
188
+ }
189
+
190
+ static VALUE skip(VALUE obj, VALUE re)
191
+ {
192
+ mmap_data_t *data;
193
+ VALUE ret = scan_sub(obj, re, 1);
194
+ if (ret == Qnil)
195
+ return ret;
196
+ Data_Get_Struct(ret, mmap_data_t, data);
197
+ return ULL2NUM(data->size);
198
+ }
199
+
200
+ static VALUE match_p(VALUE obj, VALUE re)
201
+ {
202
+ mmap_data_t *data;
203
+ VALUE ret = scan_sub(obj, re, 0);
204
+ if (ret == Qnil)
205
+ return ret;
206
+ Data_Get_Struct(ret, mmap_data_t, data);
207
+ return ULL2NUM(data->size);
208
+ }
209
+
210
+ static VALUE peek(VALUE obj, VALUE size)
211
+ {
212
+ size_t sz = NUM2SIZET(size);
213
+ mmap_data_t *data;
214
+ Data_Get_Struct(obj, mmap_data_t, data);
215
+ if (sz > data->size - data->pos)
216
+ sz = data->size - data->pos;
217
+ return rb_funcall(cMmapScanner, rb_intern("new"), 3, obj, SIZET2NUM(data->pos), SIZET2NUM(sz));
218
+ }
219
+
220
+ static VALUE eos_p(VALUE obj)
221
+ {
222
+ mmap_data_t *data;
223
+ Data_Get_Struct(obj, mmap_data_t, data);
224
+ return data->pos >= data->size ? Qtrue : Qfalse;
225
+ }
226
+
227
+ static VALUE rest(VALUE obj)
228
+ {
229
+ mmap_data_t *data;
230
+ Data_Get_Struct(obj, mmap_data_t, data);
231
+ return rb_funcall(cMmapScanner, rb_intern("new"), 2, obj, SIZET2NUM(data->pos));
232
+ }
233
+
234
+ void Init_mmapscanner(void)
235
+ {
236
+ cMmapScanner = rb_define_class("MmapScanner", rb_cObject);
237
+ rb_define_alloc_func(cMmapScanner, allocate);
238
+ rb_define_method(cMmapScanner, "initialize", initialize, -1);
239
+ rb_define_method(cMmapScanner, "size", size, 0);
240
+ rb_define_method(cMmapScanner, "length", size, 0);
241
+ rb_define_method(cMmapScanner, "to_s", to_s, 0);
242
+ rb_define_method(cMmapScanner, "slice", slice, 2);
243
+ // rb_define_method(cMmapScanner, "[]", slice, 2);
244
+ rb_define_method(cMmapScanner, "inspect", inspect, 0);
245
+ rb_define_method(cMmapScanner, "pos", pos, 0);
246
+ rb_define_method(cMmapScanner, "pos=", set_pos, 1);
247
+ rb_define_method(cMmapScanner, "scan", scan, 1);
248
+ rb_define_method(cMmapScanner, "check", check, 1);
249
+ rb_define_method(cMmapScanner, "skip", skip, 1);
250
+ rb_define_method(cMmapScanner, "match?", match_p, 1);
251
+ rb_define_method(cMmapScanner, "peek", peek, 1);
252
+ rb_define_method(cMmapScanner, "eos?", eos_p, 0);
253
+ rb_define_method(cMmapScanner, "rest", rest, 0);
254
+ }
@@ -0,0 +1,141 @@
1
+ require 'tempfile'
2
+
3
+ $LOAD_PATH.unshift "#{File.dirname __FILE__}/../ext"
4
+ require 'mmapscanner'
5
+
6
# Behavioural spec for MmapScanner. Every example runs against a fresh
# 10,000-byte temporary file containing '0123456789' repeated 1000 times.
describe MmapScanner do
  before do
    # Keep the Tempfile in an ivar so it is not finalized (and unlinked)
    # while an example is still running, and flush it so the bytes are on
    # disk before the path is reopened for mapping.
    @tmpf = Tempfile.new 'mmapscanner'
    @tmpf.write '0123456789'*1000
    @tmpf.flush
    @file = File.open(@tmpf.path)
  end
  after do
    # Release the descriptors and remove the temporary file.
    @file.close
    @tmpf.close!
  end
  subject{MmapScanner.new(@file)}
  it '#size returns size of file' do
    subject.size.should == 10000
  end
  it '#to_s returns contents of file' do
    subject.to_s.should == '0123456789'*1000
  end
  describe '#slice' do
    it 'returns MmapScanner' do
      subject.slice(10, 100).should be_instance_of MmapScanner
    end
  end
  it '#inspect returns "#<MmapScanner>"' do
    subject.inspect.should == '#<MmapScanner>'
  end
  it '#pos returns current position' do
    subject.pos.should == 0
    subject.scan(/.../)
    subject.pos.should == 3
  end
  describe '#pos=' do
    it 'change current position' do
      subject.pos = 100
      subject.pos.should == 100
    end
    it 'raise error when negative value' do
      expect{subject.pos = -1}.to raise_error(RangeError, 'out of range: -1')
    end
    it 'raise error when over size' do
      expect{subject.pos = 10001}.to raise_error(RangeError, 'out of range: 10001 > 10000')
      expect{subject.pos = 20000}.to raise_error(RangeError, 'out of range: 20000 > 10000')
    end
  end
  describe '#scan' do
    it 'returns matched data as MmapScanner' do
      ret = subject.scan(/\d{10}/)
      ret.class.should == MmapScanner
      ret.to_s.should == '0123456789'
    end
    it 'returns nil if not matched' do
      subject.scan(/123/).should be_nil
    end
    it 'forward current position' do
      subject.scan(/\d{10}/)
      subject.pos.should == 10
    end
  end
  describe '#check' do
    it 'returns matched data as MmapScanner' do
      ret = subject.check(/\d{10}/)
      ret.class.should == MmapScanner
      ret.to_s.should == '0123456789'
    end
    it 'returns nil if not matched' do
      subject.check(/123/).should be_nil
    end
    it 'do not forward current position' do
      subject.check(/\d{10}/)
      subject.pos.should == 0
    end
  end
  describe '#skip' do
    it 'returns length of matched data' do
      subject.skip(/\d{10}/).should == 10
    end
    it 'returns nil if not matched' do
      subject.skip(/123/).should be_nil
    end
    it 'forward current position' do
      subject.skip(/\d{10}/)
      subject.pos.should == 10
    end
  end
  describe '#match?' do
    it 'returns length of matched data' do
      subject.match?(/\d{10}/).should == 10
    end
    it 'returns nil if not matched' do
      subject.match?(/123/).should be_nil
    end
    it 'do not forward current position' do
      subject.match?(/\d{10}/)
      subject.pos.should == 0
    end
  end
  describe '#peek' do
    it 'returns MmapScanner' do
      subject.peek(10).should be_instance_of MmapScanner
    end
    it 'do not forward current position' do
      subject.peek(10)
      subject.pos.should == 0
    end
  end
  describe '#eos?' do
    it 'returns true if eos' do
      subject.pos = 10000
      subject.eos?.should == true
    end
    it 'returns false if not eos' do
      subject.pos = 9999
      subject.eos?.should == false
    end
  end
  describe '#rest' do
    it 'returns rest data as MmapScanner' do
      subject.pos = 9997
      ret = subject.rest
      ret.should be_instance_of MmapScanner
      ret.to_s.should == '789'
    end
  end
  describe '.new with position' do
    it '#size is length of rest data' do
      MmapScanner.new(@file, 4096).size.should == 10000-4096
    end
    it 'raise error when invalid position' do
      # 4095 is not a multiple of the page size, so mmap(2) rejects it.
      expect{MmapScanner.new(@file, 4095)}.to raise_error(Errno::EINVAL)
    end
  end
  describe '.new with length' do
    subject{MmapScanner.new(@file, nil, 10)}
    it '#size is specified size' do
      subject.size.should == 10
    end
    it 'raise error when negative' do
      expect{MmapScanner.new(@file, nil, -1)}.to raise_error(RangeError, 'length out of range: -1')
    end
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mmapscanner
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: 3
5
+ version: 0.1a
6
+ platform: ruby
7
+ authors:
8
+ - TOMITA Masahiro
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-02-05 00:00:00 +09:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description:
18
+ email: tommy@tmtm.org
19
+ executables: []
20
+
21
+ extensions:
22
+ - ext/extconf.rb
23
+ extra_rdoc_files: []
24
+
25
+ files:
26
+ - README.md
27
+ - ext/mmapscanner.c
28
+ - spec/mmapscanner_spec.rb
29
+ - ext/extconf.rb
30
+ has_rdoc: true
31
+ homepage: http://github.com/tmtm/mmapscanner
32
+ licenses:
33
+ - Ruby's
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: 1.9.2
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ none: false
47
+ requirements:
48
+ - - ">"
49
+ - !ruby/object:Gem::Version
50
+ version: 1.3.1
51
+ requirements: []
52
+
53
+ rubyforge_project:
54
+ rubygems_version: 1.5.0
55
+ signing_key:
56
+ specification_version: 3
57
+ summary: MmapScanner like StringScanner but it use mmap(2)-ed data
58
+ test_files:
59
+ - spec/mmapscanner_spec.rb