zscan 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7) hide show
  1. checksums.yaml +7 -0
  2. data/ext/extconf.rb +3 -0
  3. data/ext/zscan.c +242 -0
  4. data/lib/zscan.rb +47 -0
  5. data/readme.md +52 -0
  6. data/zscan.gemspec +16 -0
  7. metadata +48 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 708c4bbc6710c4d02ed3def05702611c16cd187a
4
+ data.tar.gz: aaf8f91e2a98fcfb18d589d4e6bb6687aeb46ca9
5
+ SHA512:
6
+ metadata.gz: a322da85ae70a1b35f33d02a9d5862250f59058c37109aad9d59b12b91dbdf66068f9740f69570d40c85290bdebf291b108e8b1daa26669e90af748bffa52d45
7
+ data.tar.gz: 5489bccc470ae3d9870cc7887f41bc43887159c1666a093cb4273709f58afcf83ada19dacf74fd74edad918fb2269e131bb1b3e2c6f757af7fc3ad8e92b53837
@@ -0,0 +1,3 @@
1
+ require "mkmf"
2
+
3
+ create_makefile 'zscan'
@@ -0,0 +1,242 @@
1
#include <ruby/ruby.h>
#include <ruby/re.h>
#include <ruby/encoding.h>

#include <string.h>
4
+
5
/* A saved scanner location, kept in both units the scanner tracks. */
typedef struct {
    size_t pos;     /* position counted in characters */
    size_t bytepos; /* position counted in bytes */
} Pos;
9
+
10
+ typedef struct {
11
+ size_t pos;
12
+ size_t bytepos;
13
+ VALUE s;
14
+ struct re_registers regs;
15
+ size_t stack_i;
16
+ size_t stack_cap;
17
+ Pos* stack;
18
+ } ZScan;
19
+
20
/* Declares `p`, the type-checked ZScan struct behind the method receiver `self`. */
#define P ZScan *p = rb_check_typeddata(self, &zscan_type)
21
+
22
+ static void zscan_mark(void* pp) {
23
+ ZScan* p = pp;
24
+ rb_gc_mark(p->s);
25
+ }
26
+
27
+ static void zscan_free(void* pp) {
28
+ ZScan* p = pp;
29
+ onig_region_free(&(p->regs), 0);
30
+ free(p->stack);
31
+ ruby_xfree(p);
32
+ }
33
+
34
+ extern size_t onig_region_memsize P_((const struct re_registers *regs));
35
+ static size_t zscan_memsize(const void* pp) {
36
+ const ZScan* p = pp;
37
+ return p ? sizeof(*p) - sizeof(p->regs) + onig_region_memsize(&p->regs) : 0;
38
+ }
39
+
40
+ static const rb_data_type_t zscan_type = {
41
+ "ZScan",
42
+ {zscan_mark, zscan_free, zscan_memsize}
43
+ };
44
+
45
+ static VALUE zscan_alloc(VALUE klass) {
46
+ ZScan* p = ALLOC(ZScan);
47
+ MEMZERO(p, ZScan, 1);
48
+ onig_region_init(&(p->regs));
49
+ p->s = Qnil;
50
+ p->stack_cap = 5;
51
+ p->stack = (Pos*)malloc(sizeof(Pos) * 5);
52
+ return TypedData_Wrap_Struct(klass, &zscan_type, p);
53
+ }
54
+
55
+ static VALUE zscan_internal_init(VALUE self, VALUE v_s) {
56
+ P;
57
+ p->s = v_s;
58
+ return self;
59
+ }
60
+
61
+ static VALUE zscan_internal_string(VALUE self) {
62
+ P;
63
+ return p->s;
64
+ }
65
+
66
+ static VALUE zscan_pos(VALUE self) {
67
+ P;
68
+ return ULONG2NUM(p->pos);
69
+ }
70
+
71
+ static VALUE zscan_advance(VALUE self, VALUE v_diff) {
72
+ P;
73
+ long signed_n = p->pos + NUM2LONG(v_diff);
74
+ if (signed_n < 0) {
75
+ p->pos = 0;
76
+ p->bytepos = 0;
77
+ return self;
78
+ }
79
+ size_t n = signed_n;
80
+
81
+ // because there's no "reverse scan" API, we have a O(n) routine :(
82
+ if (n < p->pos) {
83
+ p->pos = 0;
84
+ p->bytepos = 0;
85
+ }
86
+
87
+ if (n > p->pos) {
88
+ rb_encoding* enc = rb_enc_get(p->s);
89
+ size_t byteend = RSTRING_LEN(p->s);
90
+ char* ptr = RSTRING_PTR(p->s);
91
+ for (; p->pos < n && p->bytepos < byteend;) {
92
+ int n = rb_enc_mbclen(ptr + p->bytepos, ptr + byteend, enc);
93
+ if (n) {
94
+ p->pos++;
95
+ p->bytepos += n;
96
+ } else {
97
+ break;
98
+ }
99
+ }
100
+ }
101
+ return self;
102
+ }
103
+
104
+ static VALUE zscan_bytepos(VALUE self) {
105
+ P;
106
+ return ULONG2NUM(p->bytepos);
107
+ }
108
+
109
+ static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
110
+ P;
111
+ size_t bytepos = NUM2ULONG(v_bytepos);
112
+ size_t from, to;
113
+
114
+ if (bytepos > p->bytepos) {
115
+ from = p->bytepos;
116
+ to = bytepos;
117
+ } else if (bytepos < p->bytepos) {
118
+ from = bytepos;
119
+ to = p->bytepos;
120
+ } else {
121
+ return v_bytepos;
122
+ }
123
+
124
+ rb_encoding* enc = rb_enc_get(p->s);
125
+ char* ptr = RSTRING_PTR(p->s);
126
+ size_t diff = 0;
127
+ for (; from < to;) {
128
+ int n = rb_enc_mbclen(ptr + from, ptr + to, enc);
129
+ if (n) {
130
+ diff++;
131
+ from += n;
132
+ } else {
133
+ if (from < to) {
134
+ rb_raise(rb_eRuntimeError, "the given bytepos splits character");
135
+ return v_bytepos;
136
+ }
137
+ break;
138
+ }
139
+ }
140
+
141
+ if (bytepos > p->bytepos) {
142
+ p->pos += diff;
143
+ } else if (bytepos < p->bytepos) {
144
+ p->pos -= diff;
145
+ }
146
+ p->bytepos = bytepos;
147
+
148
+ return v_bytepos;
149
+ }
150
+
151
+ static VALUE zscan_eos_p(VALUE self) {
152
+ P;
153
+ return (p->bytepos == (size_t)RSTRING_LEN(p->s) ? Qtrue : Qfalse);
154
+ }
155
+
156
+ regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
157
+ static VALUE zscan_bmatch_p(VALUE self, VALUE pattern) {
158
+ P;
159
+ if (TYPE(pattern) == T_STRING) {
160
+ // todo
161
+ } else if (TYPE(pattern) == T_REGEXP) {
162
+ regex_t *re = rb_reg_prepare_re(pattern, p->s);
163
+ int tmpreg = re != RREGEXP(pattern)->ptr;
164
+ if (!tmpreg) RREGEXP(pattern)->usecnt++;
165
+
166
+ char* ptr = RSTRING_PTR(p->s);
167
+ UChar* ptr_end = (UChar*)(ptr + RSTRING_LEN(p->s));
168
+ UChar* ptr_match_from = (UChar*)(ptr + p->bytepos);
169
+ long ret = onig_match(re, (UChar*)ptr, ptr_end, ptr_match_from, &(p->regs), ONIG_OPTION_NONE);
170
+
171
+ if (!tmpreg) {
172
+ RREGEXP(pattern)->usecnt--;
173
+ }
174
+ if (tmpreg) {
175
+ if (RREGEXP(pattern)->usecnt) {
176
+ onig_free(re);
177
+ } else {
178
+ onig_free(RREGEXP(pattern)->ptr);
179
+ RREGEXP(pattern)->ptr = re;
180
+ }
181
+ }
182
+
183
+ if (ret == -2) {
184
+ rb_raise(rb_eRuntimeError, "regexp buffer overflow");
185
+ } else if (ret >= 0) {
186
+ return ULONG2NUM(p->regs.end[0]);
187
+ }
188
+ } else {
189
+ rb_raise(rb_eTypeError, "expect String or Regexp");
190
+ }
191
+
192
+ return Qnil;
193
+ }
194
+
195
+ static VALUE zscan_push_pos(VALUE self) {
196
+ P;
197
+ if (p->stack_i + 1 == p->stack_cap) {
198
+ p->stack_cap *= 2;
199
+ p->stack = (Pos*)realloc(p->stack, sizeof(Pos) * p->stack_cap);
200
+ }
201
+ Pos e = {p->pos, p->bytepos};
202
+ p->stack[++p->stack_i] = e;
203
+ return self;
204
+ }
205
+
206
+ static VALUE zscan_pop_pos(VALUE self) {
207
+ P;
208
+ if (p->stack_i) {
209
+ p->pos = p->stack[p->stack_i].pos;
210
+ p->bytepos = p->stack[p->stack_i].bytepos;
211
+ p->stack_i--;
212
+ } else {
213
+ p->pos = 0;
214
+ p->bytepos = 0;
215
+ }
216
+ return self;
217
+ }
218
+
219
+ static VALUE zscan_drop_top(VALUE self) {
220
+ P;
221
+ if (p->stack_i) {
222
+ p->stack_i--;
223
+ }
224
+ return self;
225
+ }
226
+
227
+ void Init_zscan() {
228
+ VALUE zscan = rb_define_class("ZScan", rb_cObject);
229
+ rb_define_alloc_func(zscan, zscan_alloc);
230
+ rb_define_method(zscan, "_internal_init", zscan_internal_init, 1);
231
+ rb_define_method(zscan, "_internal_string", zscan_internal_string, 0);
232
+
233
+ rb_define_method(zscan, "pos", zscan_pos, 0);
234
+ rb_define_method(zscan, "bytepos", zscan_bytepos, 0);
235
+ rb_define_method(zscan, "bytepos=", zscan_bytepos_eq, 1);
236
+ rb_define_method(zscan, "advance", zscan_advance, 1);
237
+ rb_define_method(zscan, "eos?", zscan_eos_p, 0);
238
+ rb_define_method(zscan, "bmatch?", zscan_bmatch_p, 1);
239
+ rb_define_method(zscan, "push_pos", zscan_push_pos, 0);
240
+ rb_define_method(zscan, "pop_pos", zscan_pop_pos, 0);
241
+ rb_define_method(zscan, "drop_top", zscan_drop_top, 0);
242
+ }
@@ -0,0 +1,47 @@
1
# coding: utf-8
# The magic comment must be on the first line to take effect; this file
# contains non-ASCII literals in the self-test at the bottom.
require_relative "../ext/zscan"

# Ruby-level conveniences on top of the ZScan class defined by the C extension.
class ZScan
  VERSION = '0.1'

  # s   - the string to scan
  # dup - when true, the scanner works on a copy of s instead of s itself
  def initialize s, dup=false
    _internal_init dup ? s.dup : s
  end

  # Returns a copy of the scanned string.
  def string
    _internal_string.dup
  end

  # If re_or_str matches at the current position, returns the matched text
  # and moves past it; otherwise returns nil and does not move.
  def scan re_or_str
    if sz = bmatch?(re_or_str)
      r = _internal_string.byteslice bytepos, sz
      self.bytepos += sz
      r
    end
  end

  # Like #scan, but only moves past the match. Returns the value of the
  # bytepos assignment on a match, nil otherwise.
  def skip re_or_str
    if sz = bmatch?(re_or_str)
      self.bytepos += sz
    end
  end

  # Moves to the character position new_pos (implemented via #advance).
  def pos= new_pos
    advance new_pos - pos
  end

  # Returns the part of the string from the current byte position to the end.
  def rest
    _internal_string.byteslice bytepos
  end

  private :_internal_init, :_internal_string
end

if __FILE__ == $PROGRAM_NAME
  z = ZScan.new 'ab你好'
  z.push_pos
  # parentheses avoid Ruby's "ambiguous first argument" warning for a regexp literal
  z.scan(/ab你/)
  p z.pos
  p z.bytepos
  z.pop_pos
end
@@ -0,0 +1,52 @@
1
+ ## Motivation
2
+
3
+ A simple string scanner. Provides far fewer methods than `StringScanner`.
4
+
5
+ It supports either string or regexp as scan param.
6
+
7
+ `pos` is counted in codepoints instead of bytes; use `bytepos` to get the byte position.
8
+
9
+ It provides a position stack for you to efficiently manage scanning locations.
10
+
11
+ It correctly scans anchors. The following code demonstrates the behavior:
12
+
13
+ ```ruby
14
+ require 'zscan'
15
+ z = ZScan.new 'ab'
16
+ z.pos = 1
17
+ z.scan /(?<=a)/ #=> ''
18
+ z.scan /^/ #=> nil
19
+ ```
20
+
21
+ While with `StringScanner`:
22
+
23
+ ```ruby
24
+ require 'strscan'
25
+ s = StringScanner.new 'ab'
26
+ s.pos = 1
27
+ s.scan /(?<=a)/ #=> nil
28
+ s.scan /^/ #=> ''
29
+ ```
30
+
31
+ See also https://bugs.ruby-lang.org/issues/7092
32
+
33
+ ## Methods
34
+
35
+ - `ZScan.new string, dup=false`
36
+ - `scan regexp_or_string`
37
+ - `skip regexp_or_string`
38
+ - `bmatch? regexp_or_string` returns length of matched bytes or nil
39
+ - `eos?`
40
+ - `string` note: returns a COW dup
41
+ - `rest`
42
+
43
+ ## Position management
44
+
45
+ - `pos`
46
+ - `pos= new_pos` note: complexity ~ `new_pos > pos ? new_pos - pos : new_pos`.
47
+ - `bytepos`
48
+ - `bytepos= new_bytepos` note: complexity ~ `abs(new_bytepos - bytepos)`.
49
+ - `advance n` moves forward `n` codepoints; if `n < 0`, it moves backward. Stops at the beginning or end.
50
+ - `push_pos` efficiently pushes current pos into the stack.
51
+ - `pop_pos` efficiently sets current pos to top of the stack, and pops it.
52
+ - `drop_top` drops top of pos stack without changing current pos.
@@ -0,0 +1,16 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = "zscan"
3
+ s.version = "0.1"
4
+ s.author = "Zete Lui"
5
+ s.homepage = "https://github.com/luikore/zscan"
6
+ s.platform = Gem::Platform::RUBY
7
+ s.summary = "improved string scanner"
8
+ s.description = "improved string scanner"
9
+ s.required_ruby_version = ">=1.9.2"
10
+
11
+ s.files = %w"readme.md lib/zscan.rb ext/zscan.c ext/extconf.rb zscan.gemspec"
12
+ s.require_paths = ["lib"]
13
+ s.extensions = ["ext/extconf.rb"]
14
+ s.rubygems_version = '1.8.24'
15
+ s.has_rdoc = false
16
+ end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zscan
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Zete Lui
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-05-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: improved string scanner
14
+ email:
15
+ executables: []
16
+ extensions:
17
+ - ext/extconf.rb
18
+ extra_rdoc_files: []
19
+ files:
20
+ - readme.md
21
+ - lib/zscan.rb
22
+ - ext/zscan.c
23
+ - ext/extconf.rb
24
+ - zscan.gemspec
25
+ homepage: https://github.com/luikore/zscan
26
+ licenses: []
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 1.9.2
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.0.3
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: improved string scanner
48
+ test_files: []