zscan 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (7) hide show
  1. checksums.yaml +7 -0
  2. data/ext/extconf.rb +3 -0
  3. data/ext/zscan.c +242 -0
  4. data/lib/zscan.rb +47 -0
  5. data/readme.md +52 -0
  6. data/zscan.gemspec +16 -0
  7. metadata +48 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 708c4bbc6710c4d02ed3def05702611c16cd187a
4
+ data.tar.gz: aaf8f91e2a98fcfb18d589d4e6bb6687aeb46ca9
5
+ SHA512:
6
+ metadata.gz: a322da85ae70a1b35f33d02a9d5862250f59058c37109aad9d59b12b91dbdf66068f9740f69570d40c85290bdebf291b108e8b1daa26669e90af748bffa52d45
7
+ data.tar.gz: 5489bccc470ae3d9870cc7887f41bc43887159c1666a093cb4273709f58afcf83ada19dacf74fd74edad918fb2269e131bb1b3e2c6f757af7fc3ad8e92b53837
@@ -0,0 +1,3 @@
1
# Build script: generates the Makefile for the `zscan` native extension.
require 'mkmf'

create_makefile('zscan')
@@ -0,0 +1,242 @@
1
+ #include <ruby/ruby.h>
2
+ #include <ruby/re.h>
3
+ #include <ruby/encoding.h>
4
+
5
+ typedef struct {
6
+ size_t pos;
7
+ size_t bytepos;
8
+ } Pos;
9
+
10
+ typedef struct {
11
+ size_t pos;
12
+ size_t bytepos;
13
+ VALUE s;
14
+ struct re_registers regs;
15
+ size_t stack_i;
16
+ size_t stack_cap;
17
+ Pos* stack;
18
+ } ZScan;
19
+
20
+ #define P ZScan* p = rb_check_typeddata(self, &zscan_type)
21
+
22
+ static void zscan_mark(void* pp) {
23
+ ZScan* p = pp;
24
+ rb_gc_mark(p->s);
25
+ }
26
+
27
+ static void zscan_free(void* pp) {
28
+ ZScan* p = pp;
29
+ onig_region_free(&(p->regs), 0);
30
+ free(p->stack);
31
+ ruby_xfree(p);
32
+ }
33
+
34
+ extern size_t onig_region_memsize P_((const struct re_registers *regs));
35
+ static size_t zscan_memsize(const void* pp) {
36
+ const ZScan* p = pp;
37
+ return p ? sizeof(*p) - sizeof(p->regs) + onig_region_memsize(&p->regs) : 0;
38
+ }
39
+
40
+ static const rb_data_type_t zscan_type = {
41
+ "ZScan",
42
+ {zscan_mark, zscan_free, zscan_memsize}
43
+ };
44
+
45
+ static VALUE zscan_alloc(VALUE klass) {
46
+ ZScan* p = ALLOC(ZScan);
47
+ MEMZERO(p, ZScan, 1);
48
+ onig_region_init(&(p->regs));
49
+ p->s = Qnil;
50
+ p->stack_cap = 5;
51
+ p->stack = (Pos*)malloc(sizeof(Pos) * 5);
52
+ return TypedData_Wrap_Struct(klass, &zscan_type, p);
53
+ }
54
+
55
+ static VALUE zscan_internal_init(VALUE self, VALUE v_s) {
56
+ P;
57
+ p->s = v_s;
58
+ return self;
59
+ }
60
+
61
+ static VALUE zscan_internal_string(VALUE self) {
62
+ P;
63
+ return p->s;
64
+ }
65
+
66
+ static VALUE zscan_pos(VALUE self) {
67
+ P;
68
+ return ULONG2NUM(p->pos);
69
+ }
70
+
71
+ static VALUE zscan_advance(VALUE self, VALUE v_diff) {
72
+ P;
73
+ long signed_n = p->pos + NUM2LONG(v_diff);
74
+ if (signed_n < 0) {
75
+ p->pos = 0;
76
+ p->bytepos = 0;
77
+ return self;
78
+ }
79
+ size_t n = signed_n;
80
+
81
+ // because there's no "reverse scan" API, we have a O(n) routine :(
82
+ if (n < p->pos) {
83
+ p->pos = 0;
84
+ p->bytepos = 0;
85
+ }
86
+
87
+ if (n > p->pos) {
88
+ rb_encoding* enc = rb_enc_get(p->s);
89
+ size_t byteend = RSTRING_LEN(p->s);
90
+ char* ptr = RSTRING_PTR(p->s);
91
+ for (; p->pos < n && p->bytepos < byteend;) {
92
+ int n = rb_enc_mbclen(ptr + p->bytepos, ptr + byteend, enc);
93
+ if (n) {
94
+ p->pos++;
95
+ p->bytepos += n;
96
+ } else {
97
+ break;
98
+ }
99
+ }
100
+ }
101
+ return self;
102
+ }
103
+
104
+ static VALUE zscan_bytepos(VALUE self) {
105
+ P;
106
+ return ULONG2NUM(p->bytepos);
107
+ }
108
+
109
+ static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
110
+ P;
111
+ size_t bytepos = NUM2ULONG(v_bytepos);
112
+ size_t from, to;
113
+
114
+ if (bytepos > p->bytepos) {
115
+ from = p->bytepos;
116
+ to = bytepos;
117
+ } else if (bytepos < p->bytepos) {
118
+ from = bytepos;
119
+ to = p->bytepos;
120
+ } else {
121
+ return v_bytepos;
122
+ }
123
+
124
+ rb_encoding* enc = rb_enc_get(p->s);
125
+ char* ptr = RSTRING_PTR(p->s);
126
+ size_t diff = 0;
127
+ for (; from < to;) {
128
+ int n = rb_enc_mbclen(ptr + from, ptr + to, enc);
129
+ if (n) {
130
+ diff++;
131
+ from += n;
132
+ } else {
133
+ if (from < to) {
134
+ rb_raise(rb_eRuntimeError, "the given bytepos splits character");
135
+ return v_bytepos;
136
+ }
137
+ break;
138
+ }
139
+ }
140
+
141
+ if (bytepos > p->bytepos) {
142
+ p->pos += diff;
143
+ } else if (bytepos < p->bytepos) {
144
+ p->pos -= diff;
145
+ }
146
+ p->bytepos = bytepos;
147
+
148
+ return v_bytepos;
149
+ }
150
+
151
+ static VALUE zscan_eos_p(VALUE self) {
152
+ P;
153
+ return (p->bytepos == (size_t)RSTRING_LEN(p->s) ? Qtrue : Qfalse);
154
+ }
155
+
156
+ regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
157
+ static VALUE zscan_bmatch_p(VALUE self, VALUE pattern) {
158
+ P;
159
+ if (TYPE(pattern) == T_STRING) {
160
+ // todo
161
+ } else if (TYPE(pattern) == T_REGEXP) {
162
+ regex_t *re = rb_reg_prepare_re(pattern, p->s);
163
+ int tmpreg = re != RREGEXP(pattern)->ptr;
164
+ if (!tmpreg) RREGEXP(pattern)->usecnt++;
165
+
166
+ char* ptr = RSTRING_PTR(p->s);
167
+ UChar* ptr_end = (UChar*)(ptr + RSTRING_LEN(p->s));
168
+ UChar* ptr_match_from = (UChar*)(ptr + p->bytepos);
169
+ long ret = onig_match(re, (UChar*)ptr, ptr_end, ptr_match_from, &(p->regs), ONIG_OPTION_NONE);
170
+
171
+ if (!tmpreg) {
172
+ RREGEXP(pattern)->usecnt--;
173
+ }
174
+ if (tmpreg) {
175
+ if (RREGEXP(pattern)->usecnt) {
176
+ onig_free(re);
177
+ } else {
178
+ onig_free(RREGEXP(pattern)->ptr);
179
+ RREGEXP(pattern)->ptr = re;
180
+ }
181
+ }
182
+
183
+ if (ret == -2) {
184
+ rb_raise(rb_eRuntimeError, "regexp buffer overflow");
185
+ } else if (ret >= 0) {
186
+ return ULONG2NUM(p->regs.end[0]);
187
+ }
188
+ } else {
189
+ rb_raise(rb_eTypeError, "expect String or Regexp");
190
+ }
191
+
192
+ return Qnil;
193
+ }
194
+
195
+ static VALUE zscan_push_pos(VALUE self) {
196
+ P;
197
+ if (p->stack_i + 1 == p->stack_cap) {
198
+ p->stack_cap *= 2;
199
+ p->stack = (Pos*)realloc(p->stack, sizeof(Pos) * p->stack_cap);
200
+ }
201
+ Pos e = {p->pos, p->bytepos};
202
+ p->stack[++p->stack_i] = e;
203
+ return self;
204
+ }
205
+
206
+ static VALUE zscan_pop_pos(VALUE self) {
207
+ P;
208
+ if (p->stack_i) {
209
+ p->pos = p->stack[p->stack_i].pos;
210
+ p->bytepos = p->stack[p->stack_i].bytepos;
211
+ p->stack_i--;
212
+ } else {
213
+ p->pos = 0;
214
+ p->bytepos = 0;
215
+ }
216
+ return self;
217
+ }
218
+
219
+ static VALUE zscan_drop_top(VALUE self) {
220
+ P;
221
+ if (p->stack_i) {
222
+ p->stack_i--;
223
+ }
224
+ return self;
225
+ }
226
+
227
+ void Init_zscan() {
228
+ VALUE zscan = rb_define_class("ZScan", rb_cObject);
229
+ rb_define_alloc_func(zscan, zscan_alloc);
230
+ rb_define_method(zscan, "_internal_init", zscan_internal_init, 1);
231
+ rb_define_method(zscan, "_internal_string", zscan_internal_string, 0);
232
+
233
+ rb_define_method(zscan, "pos", zscan_pos, 0);
234
+ rb_define_method(zscan, "bytepos", zscan_bytepos, 0);
235
+ rb_define_method(zscan, "bytepos=", zscan_bytepos_eq, 1);
236
+ rb_define_method(zscan, "advance", zscan_advance, 1);
237
+ rb_define_method(zscan, "eos?", zscan_eos_p, 0);
238
+ rb_define_method(zscan, "bmatch?", zscan_bmatch_p, 1);
239
+ rb_define_method(zscan, "push_pos", zscan_push_pos, 0);
240
+ rb_define_method(zscan, "pop_pos", zscan_pop_pos, 0);
241
+ rb_define_method(zscan, "drop_top", zscan_drop_top, 0);
242
+ }
@@ -0,0 +1,47 @@
1
# coding: utf-8
# The magic comment must stay on the first line: this file contains non-ASCII
# string and regexp literals, and Ruby only honours it there.
require_relative "../ext/zscan"

# Ruby-level half of the scanner. The primitives (pos, bytepos, bytepos=,
# advance, eos?, bmatch?, push_pos, pop_pos, drop_top) come from the native
# extension required above.
class ZScan
  VERSION = '0.1'

  # s   - the String to scan.
  # dup - when true, a copy of s is scanned instead of s itself.
  def initialize s, dup=false
    _internal_init dup ? s.dup : s
  end

  # Returns a copy of the scanned string.
  def string
    _internal_string.dup
  end

  # Matches re_or_str at the current position. On success returns the matched
  # text and moves past it; returns nil otherwise.
  def scan re_or_str
    if sz = bmatch?(re_or_str)
      r = _internal_string.byteslice bytepos, sz
      self.bytepos += sz
      r
    end
  end

  # Like #scan, but only moves past the match: returns the new byte position
  # on success and nil otherwise.
  def skip re_or_str
    if sz = bmatch?(re_or_str)
      self.bytepos += sz
    end
  end

  # Moves to the character position new_pos.
  def pos= new_pos
    advance new_pos - pos
  end

  # Returns the part of the string from the current position to the end.
  def rest
    _internal_string.byteslice bytepos
  end

  private :_internal_init, :_internal_string
end

# Small smoke run, executed only when this file is run directly.
if __FILE__ == $PROGRAM_NAME
  z = ZScan.new 'ab你好'
  z.push_pos
  z.scan(/ab你/)
  p z.pos
  p z.bytepos
  z.pop_pos
end
@@ -0,0 +1,52 @@
1
+ ## Motivation
2
+
3
+ A simple string scanner. Provides far fewer methods than `StringScanner`.
4
+
5
+ It supports either string or regexp as scan param.
6
+
7
+ `pos` is by codepoints instead of bytes, use `bytepos` to locate byte position.
8
+
9
+ It provides a position stack for you to efficiently manage scanning locations.
10
+
11
+ It correctly scans anchors. The following code demonstrates the behavior:
12
+
13
+ ```ruby
14
+ require 'zscan'
15
+ z = ZScan.new 'ab'
16
+ z.pos = 1
17
+ z.scan /(?<=a)/ #=> ''
18
+ z.scan /^/ #=> nil
19
+ ```
20
+
21
+ While with `StringScanner`:
22
+
23
+ ```ruby
24
+ require 'strscan'
25
+ s = StringScanner.new 'ab'
26
+ s.pos = 1
27
+ s.scan /(?<=a)/ #=> nil
28
+ s.scan /^/ #=> ''
29
+ ```
30
+
31
+ See also https://bugs.ruby-lang.org/issues/7092
32
+
33
+ ## Methods
34
+
35
+ - `ZScan.new string, dup=false`
36
+ - `scan regexp_or_string`
37
+ - `skip regexp_or_string`
38
+ - `bmatch? regexp_or_string` returns length of matched bytes or nil
39
+ - `eos?`
40
+ - `string` note: returns a COW dup
41
+ - `rest`
42
+
43
+ ## Position management
44
+
45
+ - `pos`
46
+ - `pos= new_pos` note: complexity ~ `new_pos > pos ? new_pos - pos : new_pos`.
47
+ - `bytepos`
48
+ - `bytepos= new_bytepos` note: complexity ~ `abs(new_bytepos - bytepos)`.
49
+ - `advance n` moves forward `n` codepoints; if `n < 0`, moves backward. Stops at the beginning or end of the string.
50
+ - `push_pos` efficiently pushes current pos into the stack.
51
+ - `pop_pos` efficiently sets current pos to top of the stack, and pops it.
52
+ - `drop_top` drops top of pos stack without changing current pos.
@@ -0,0 +1,16 @@
1
# Gem specification for zscan: a pure-Ruby wrapper plus a native extension
# built through ext/extconf.rb.
Gem::Specification.new do |spec|
  spec.name        = "zscan"
  spec.version     = "0.1"
  spec.author      = "Zete Lui"
  spec.homepage    = "https://github.com/luikore/zscan"
  spec.platform    = Gem::Platform::RUBY
  spec.summary     = "improved string scanner"
  spec.description = "improved string scanner"
  spec.required_ruby_version = ">=1.9.2"

  spec.files = %w[
    readme.md
    lib/zscan.rb
    ext/zscan.c
    ext/extconf.rb
    zscan.gemspec
  ]
  spec.require_paths    = ["lib"]
  spec.extensions       = ["ext/extconf.rb"]
  spec.rubygems_version = '1.8.24'
  spec.has_rdoc         = false
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zscan
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Zete Lui
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-05-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: improved string scanner
14
+ email:
15
+ executables: []
16
+ extensions:
17
+ - ext/extconf.rb
18
+ extra_rdoc_files: []
19
+ files:
20
+ - readme.md
21
+ - lib/zscan.rb
22
+ - ext/zscan.c
23
+ - ext/extconf.rb
24
+ - zscan.gemspec
25
+ homepage: https://github.com/luikore/zscan
26
+ licenses: []
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 1.9.2
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.0.3
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: improved string scanner
48
+ test_files: []