unicode_scanner 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm 1.9.3@scanner --create
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ source :rubygems
2
+
3
+ group :development do
4
+ gem 'rspec'
5
+
6
+ gem 'redcarpet'
7
+ gem 'yard'
8
+
9
+ gem 'bundler'
10
+ gem 'jeweler'
11
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,34 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.1.3)
5
+ git (1.2.5)
6
+ jeweler (1.8.4)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rdoc
11
+ json (1.7.3)
12
+ rake (0.9.2.2)
13
+ rdoc (3.12)
14
+ json (~> 1.4)
15
+ redcarpet (2.1.1)
16
+ rspec (2.11.0)
17
+ rspec-core (~> 2.11.0)
18
+ rspec-expectations (~> 2.11.0)
19
+ rspec-mocks (~> 2.11.0)
20
+ rspec-core (2.11.0)
21
+ rspec-expectations (2.11.1)
22
+ diff-lcs (~> 1.1.3)
23
+ rspec-mocks (2.11.1)
24
+ yard (0.8.2.1)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bundler
31
+ jeweler
32
+ redcarpet
33
+ rspec
34
+ yard
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Tim Morgan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ Unicode String Scanner
2
+ ======================
3
+
4
+ A Unicode-aware implementation of Ruby's `StringScanner`.
5
+
6
+ | | |
7
+ |:------------|:--------------------------------|
8
+ | **Author** | Tim Morgan |
9
+ | **Version** | 1.0 (Jul 11, 2012) |
10
+ | **License** | Released under the MIT license. |
11
+
12
+ About
13
+ -----
14
+
15
+ Did you know that `StringScanner` splits codepoints? Neither did I. This one
16
+ doesn't.
17
+
18
+ **When would I want to use this?** When you want to use `StringScanner` on a
19
+ Unicode (UTF-_n_) string.
20
+
21
+ **When would I _not_ want to use this?** If you're interested in speed. This is
22
+ slower than StringScanner because a) it's not written in native C, and b) it's
23
+ slower to traverse Unicode strings anyway because characters can have varying
24
+ byte sizes.
25
+
26
+ Installation
27
+ ------------
28
+
29
+ Simply add this gem to your project's `Gemfile`:
30
+
31
+ ```` ruby
32
+ gem 'unicode_scanner'
33
+ ````
34
+
35
+ Usage
36
+ -----
37
+
38
+ The `UnicodeScanner` object responds to exactly the same API as
39
+ [StringScanner](http://ruby-doc.org/stdlib-1.9.3/libdoc/strscan/rdoc/StringScanner.html),
40
+ with the exception of the following methods:
41
+
42
+ * `getbyte`
43
+ * any obsolete methods
44
+
45
+ For more information, see the {UnicodeScanner} class documentation.
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "unicode_scanner"
18
+ gem.homepage = "http://github.com/RISCfuture/unicode_scanner"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{Unicode-aware implementation of StringScanner}
21
+ gem.description = %Q{An implementation of StringScanner that doesn't split multibyte characters.}
22
+ gem.email = "git@timothymorgan.info"
23
+ gem.authors = ["Tim Morgan"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ require 'rspec/core'
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec) do |spec|
31
+ spec.pattern = FileList['spec/**/*_spec.rb']
32
+ end
33
+
34
+ task default: :spec
35
+
36
+ require 'yard'
37
+
38
+ # bring sexy back (sexy == tables)
39
+ module YARD::Templates::Helpers::HtmlHelper
40
+ def html_markup_markdown(text)
41
+ markup_class(:markdown).new(text, :gh_blockcode, :fenced_code, :autolink, :tables).to_html
42
+ end
43
+ end
44
+
45
+ YARD::Rake::YardocTask.new('doc') do |doc|
46
+ doc.options << '-m' << 'markdown' << '-M' << 'redcarpet'
47
+ doc.options << '--protected' << '--no-private'
48
+ doc.options << '-r' << 'README.md'
49
+ doc.options << '-o' << 'doc'
50
+ doc.options << '--title' << 'Unicode String Scanner Documentation'
51
+
52
+ doc.files = %w( lib/**/* README.md )
53
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,655 @@
1
+ # UnicodeScanner provides for Unicode-aware lexical scanning operations on a
2
+ # `String`. Here is an example of its usage:
3
+ #
4
+ # ```` ruby
5
+ # s = UnicodeScanner.new('This is an example string')
6
+ # s.eos? # -> false
7
+ #
8
+ # p s.scan(/\w+/) # -> "This"
9
+ # p s.scan(/\w+/) # -> nil
10
+ # p s.scan(/\s+/) # -> " "
11
+ # p s.scan(/\s+/) # -> nil
12
+ # p s.scan(/\w+/) # -> "is"
13
+ # s.eos? # -> false
14
+ #
15
+ # p s.scan(/\s+/) # -> " "
16
+ # p s.scan(/\w+/) # -> "an"
17
+ # p s.scan(/\s+/) # -> " "
18
+ # p s.scan(/\w+/) # -> "example"
19
+ # p s.scan(/\s+/) # -> " "
20
+ # p s.scan(/\w+/) # -> "string"
21
+ # s.eos? # -> true
22
+ #
23
+ # p s.scan(/\s+/) # -> nil
24
+ # p s.scan(/\w+/) # -> nil
25
+ # ````
26
+ #
27
+ # Scanning a string means remembering the position of a _scan pointer_, which is
28
+ # just an index. The point of scanning is to move forward a bit at a time, so
29
+ # matches are sought after the scan pointer; usually immediately after it.
30
+ #
31
+ # Given the string "test string", here are the pertinent scan pointer positions:
32
+ #
33
+ # ````
34
+ #   t e s t   s t r i n g
35
+ # 0 1 2 ...             1
36
+ #                       0
37
+ # ````
38
+ #
39
+ # When you {#scan} for a pattern (a regular expression), the match must occur at
40
+ # the character after the scan pointer. If you use {#scan_until}, then the
41
+ # match can occur anywhere after the scan pointer. In both cases, the scan
42
+ # pointer moves _just beyond_ the last character of the match, ready to scan
43
+ # again from the next character onwards. This is demonstrated by the example
44
+ # above.
45
+ #
46
+ # Method Categories
47
+ # -----------------
48
+ #
49
+ # There are other methods besides the plain scanners. You can look ahead in the
50
+ # string without actually scanning. You can access the most recent match. You
51
+ # can modify the string being scanned, reset or terminate the scanner, find out
52
+ # or change the position of the scan pointer, skip ahead, and so on.
53
+ #
54
+ # ### Advancing the Scan Pointer
55
+ #
56
+ # - {#getch}
57
+ # - {#scan}
58
+ # - {#scan_until}
59
+ # - {#skip}
60
+ # - {#skip_until}
61
+ #
62
+ # ### Looking Ahead
63
+ #
64
+ # - {#check}
65
+ # - {#check_until}
66
+ # - {#exist?}
67
+ # - {#match?}
68
+ # - {#peek}
69
+ #
70
+ # ### Finding Where we Are
71
+ #
72
+ # - {#beginning_of_line?} ({#bol?})
73
+ # - {#eos?}
74
+ # - {#rest_size}
75
+ # - {#pos}
76
+ #
77
+ # ### Setting Where we Are
78
+ #
79
+ # - {#reset}
80
+ # - {#terminate}
81
+ # - {#pos=}
82
+ #
83
+ # ### Match Data
84
+ #
85
+ # - {#matched}
86
+ # - {#matched?}
87
+ # - {#matched_size}
88
+ # - {#[]}
89
+ # - {#pre_match}
90
+ # - {#post_match}
91
+ #
92
+ # ### Miscellaneous
93
+ #
94
+ # - {#<<}
95
+ # - {#concat}
96
+ # - {#string}
97
+ # - {#string=}
98
+ # - {#unscan}
99
+ #
100
+ # There are aliases to several of the methods.
101
+
102
class UnicodeScanner
  # Maximum number of characters shown on each side of the scan pointer by
  # {#inspect}.
  INSPECT_LENGTH = 5

  # Creates a new UnicodeScanner object to scan over the given `string`.
  #
  # @param [String] string The string to iterate over.

  def initialize(string)
    @string   = string
    @matches  = nil   # MatchData of the last successful match (relative to the rest of the string)
    @matched  = false # whether the last scanning operation matched
    @current  = 0     # scan pointer, as a character index into @string
    @previous = 0     # scan pointer before the last successful match
  end

  # Appends `str` to the string being scanned. This method does not affect scan
  # pointer. Note that the string given to {#initialize} is modified in place.
  #
  # @param [String] str The string to append.
  # @return [String] The string being scanned, with `str` appended.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.scan(/Fri /)
  #   s << " +1000 GMT"
  #   s.string # -> "Fri Dec 12 1975 14:39 +1000 GMT"
  #   s.scan(/Dec/) # -> "Dec"

  def concat(str)
    @string.concat str
  end

  alias << concat

  # Return the <i>n</i>th subgroup in the most recent match.
  #
  # @param [Integer] n The index of the subgroup to return.
  # @return [String, nil] The subgroup, if it exists.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
  #   s[0] # -> "Fri Dec 12 "
  #   s[1] # -> "Fri"
  #   s[2] # -> "Dec"
  #   s[3] # -> "12"
  #   s.post_match # -> "1975 14:39"
  #   s.pre_match # -> ""

  def [](n)
    @matched ? @matches[n] : nil
  end

  # @return [true, false, nil] `true` iff the scan pointer is at the beginning
  #   of a line; `nil` if the scan pointer lies beyond the end of the string.
  #
  # @example
  #   s = UnicodeScanner.new("test\ntest\n")
  #   s.bol? # => true
  #   s.scan(/te/)
  #   s.bol? # => false
  #   s.scan(/st\n/)
  #   s.bol? # => true
  #   s.terminate
  #   s.bol? # => true

  def beginning_of_line?
    return nil if @current > @string.length
    @current == 0 || @string[@current - 1] == "\n"
  end

  alias bol? beginning_of_line?

  # This returns the value that {#scan} would return, without advancing the scan
  # pointer. The match register is affected, though.
  #
  # Mnemonic: it "checks" to see whether a {#scan} will return a value.
  #
  # @param [Regexp] pattern The pattern to scan for.
  # @return [String, nil] The matched segment, if matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.check /Fri/ # -> "Fri"
  #   s.pos # -> 0
  #   s.matched # -> "Fri"
  #   s.check /12/ # -> nil
  #   s.matched # -> nil

  def check(pattern)
    do_scan pattern, false, true, true
  end

  # This returns the value that {#scan_until} would return, without advancing
  # the scan pointer. The match register is affected, though.
  #
  # Mnemonic: it "checks" to see whether a {#scan_until} will return a value.
  #
  # @param [Regexp] pattern The pattern to scan until reaching.
  # @return [String, nil] The matched segment, if matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.check_until /12/ # -> "Fri Dec 12"
  #   s.pos # -> 0
  #   s.matched # -> "12"

  def check_until(pattern)
    do_scan pattern, false, true, false
  end

  # @return [true, false] `true` if the scan pointer is at the end of the string.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.eos? # => false
  #   s.scan(/test/)
  #   p s.eos? # => false
  #   s.terminate
  #   p s.eos? # => true

  def eos?
    @current >= @string.length
  end

  # Looks _ahead_ to see if the `pattern` exists _anywhere_ in the string,
  # without advancing the scan pointer. This predicates whether a {#scan_until}
  # will return a value.
  #
  # @param [Regexp] pattern The pattern to search for.
  # @return [Integer, nil] The number of characters from the scan pointer to
  #   the end of the match, or `nil` if the pattern does not exist ahead.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.exist? /s/ # -> 3
  #   s.scan /test/ # -> "test"
  #   s.exist? /s/ # -> 2
  #   s.exist? /e/ # -> nil

  def exist?(pattern)
    do_scan pattern, false, false, false
  end

  # Scans one character and returns it. Any character is consumed, including a
  # newline and characters that occupy more than one byte.
  #
  # @return [String, nil] The character, or `nil` at the end of the string.
  #
  # @example
  #   s = UnicodeScanner.new("a\nb")
  #   s.getch # => "a"
  #   s.getch # => "\n"
  #   s.getch # => "b"
  #   s.getch # => nil
  #
  #   s = UnicodeScanner.new("\u3041") # a single multibyte character
  #   s.getch # => "\u3041"
  #   s.getch # => nil

  def getch
    # The /m flag is required so that "." also matches "\n"; without it the
    # scanner would return nil (and never advance) when positioned at a newline.
    do_scan(/./mu, true, true, true)
  end

  # Returns a string that represents the UnicodeScanner object, showing:
  #
  # * the current position
  # * the size of the string
  # * the characters surrounding the scan pointer
  #
  # @return [String] A description of this object.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.inspect # -> '#<UnicodeScanner 0/21 @ "Fri D...">'
  #   s.scan_until /12/ # -> "Fri Dec 12"
  #   s.inspect # -> '#<UnicodeScanner 10/21 "...ec 12" @ " 1975...">'

  def inspect
    return "#<#{self.class.to_s} (uninitialized)>" if @string.nil?
    return "#<#{self.class.to_s} fin>" if eos?

    if @current == 0
      "#<%s %d/%d @ %s>" % [self.class.to_s, @current, @string.length, inspect_after.inspect]
    else
      "#<%s %d/%d %s @ %s>" % [self.class.to_s, @current, @string.length, inspect_before.inspect, inspect_after.inspect]
    end
  end

  # Tests whether the given `pattern` is matched from the current scan pointer.
  # Returns the length of the match, or `nil`. The scan pointer is not advanced.
  #
  # @param [Regexp] pattern The pattern to match with.
  # @return [Integer, nil] The length of the match in characters, or `nil` if
  #   the pattern is not matched at the scan pointer.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.match?(/\w+/) # -> 4
  #   p s.match?(/\w+/) # -> 4
  #   p s.match?(/\s+/) # -> nil

  def match?(pattern)
    do_scan pattern, false, false, true
  end

  # @return [String, nil] The last matched string.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.match?(/\w+/) # -> 4
  #   s.matched # -> "test"

  def matched
    return nil unless @matched
    @matches[0]
  end

  # @return [true, false] `true` iff the last match was successful.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.match?(/\w+/) # => 4
  #   s.matched? # => true
  #   s.match?(/\d+/) # => nil
  #   s.matched? # => false

  def matched?
    @matched
  end

  # @return [Integer, nil] The size in characters of the most recent match (see
  #   {#matched}), or `nil` if there was no recent match.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.check /\w+/ # -> "test"
  #   s.matched_size # -> 4
  #   s.check /\d+/ # -> nil
  #   s.matched_size # -> nil

  def matched_size
    return nil unless @matched
    @matches.end(0) - @matches.begin(0)
  end

  # Extracts a string corresponding to `string[pos,len]`, without advancing the
  # scan pointer.
  #
  # @param [Integer] len The number of characters ahead to peek.
  # @return [String] The string after the current position.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.peek(7) # => "test st"
  #   s.peek(7) # => "test st"

  def peek(len)
    return '' if eos?
    @string[@current, len]
  end

  # Returns the position of the scan pointer, counted in characters (not
  # bytes). In the 'reset' position, this value is zero. In the 'terminated'
  # position (i.e. the string is exhausted), this value is the length of the
  # string.
  #
  # In short, it's a 0-based character index into the string.
  #
  # @return [Integer] The current scan position.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.pos # -> 0
  #   s.scan_until /str/ # -> "test str"
  #   s.pos # -> 8
  #   s.terminate # -> #<UnicodeScanner fin>
  #   s.pos # -> 11

  def pos
    @current
  end

  alias pointer pos

  # Set the position of the scan pointer, counted in characters. A negative
  # value is taken relative to the end of the string.
  #
  # @param [Integer] n The new position.
  # @raise [RangeError] If the position falls outside the string.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.pos = 7 # -> 7
  #   s.rest # -> "ring"

  def pos=(n)
    n += @string.length if n < 0
    raise RangeError, "index out of range" if n < 0
    raise RangeError, "index out of range" if n > @string.length
    @current = n
  end

  # @return [String, nil] The _**post**-match_ (in the regular expression sense)
  #   of the last scan, or `nil` if the last scan did not match.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.scan(/\w+/) # -> "test"
  #   s.scan(/\s+/) # -> " "
  #   s.pre_match # -> "test"
  #   s.post_match # -> "string"

  def post_match
    return nil unless @matched
    # Match offsets are relative to where the scan started (@previous).
    @string[@previous + @matches.end(0), @string.length]
  end

  # @return [String, nil] The _**pre**-match_ (in the regular expression sense)
  #   of the last scan, or `nil` if the last scan did not match.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.scan(/\w+/) # -> "test"
  #   s.scan(/\s+/) # -> " "
  #   s.pre_match # -> "test"
  #   s.post_match # -> "string"

  def pre_match
    return nil unless @matched
    # Match offsets are relative to where the scan started (@previous).
    @string[0, @previous + @matches.begin(0)]
  end

  # Reset the scan pointer (index 0) and clear matching data.
  #
  # @return [UnicodeScanner] This scanner.

  def reset
    @current = 0
    @matched = false
    self
  end

  # @return [String] The "rest" of the string (i.e. everything after the scan
  #   pointer). If there is no more data (`eos? = true`), it returns `""`.

  def rest
    return '' if eos?
    @string[@current, @string.length]
  end

  # @return [Integer] The value returned by `s.rest.size`.

  def rest_size
    return 0 if eos?
    @string.length - @current
  end

  # Tries to match with `pattern` at the current position. If there's a match,
  # the scanner advances the "scan pointer" and returns the matched string.
  # Otherwise, the scanner returns `nil`.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [String, nil] The string that was matched, if a match was found.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.scan(/\w+/) # -> "test"
  #   p s.scan(/\w+/) # -> nil
  #   p s.scan(/\s+/) # -> " "
  #   p s.scan(/\w+/) # -> "string"
  #   p s.scan(/./) # -> nil

  def scan(pattern)
    do_scan pattern, true, true, true
  end

  # Tests whether the given `pattern` is matched from the current scan pointer.
  # Advances the scan pointer if `advance_pointer` is `true`. Returns the
  # matched string if `return_string` is true. The match register is affected.
  #
  # "full" means "scan with full parameters".
  #
  # @param [Regexp] pattern The pattern to scan.
  # @param [true, false] advance_pointer Whether to advance the scan pointer if
  #   a match is found.
  # @param [true, false] return_string Whether to return the matched segment.
  # @return [String, Integer, nil] The matched segment if `return_string` is
  #   `true`, otherwise the number of characters advanced. `nil` if nothing
  #   matched.

  def scan_full(pattern, advance_pointer, return_string)
    do_scan pattern, advance_pointer, return_string, true
  end

  # Scans the string _until_ the `pattern` is matched. Returns the substring up
  # to and including the end of the match, advancing the scan pointer to that
  # location. If there is no match, `nil` is returned.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [String, nil] The segment that matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.scan_until(/1/) # -> "Fri Dec 1"
  #   s.pre_match # -> "Fri Dec "
  #   s.scan_until(/XYZ/) # -> nil

  def scan_until(pattern)
    do_scan pattern, true, true, false
  end

  # Scans the string _until_ the pattern is matched. Advances the scan pointer
  # if `advance_pointer`, otherwise not. Returns the matched string if
  # `return_string` is `true`, otherwise returns the number of characters
  # advanced. This method does affect the match register.
  #
  # @param [Regexp] pattern The pattern to scan.
  # @param [true, false] advance_pointer Whether to advance the scan pointer if
  #   a match is found.
  # @param [true, false] return_string Whether to return the matched segment.
  # @return [String, Integer, nil] The matched segment if `return_string` is
  #   `true`, otherwise the number of characters advanced. `nil` if nothing
  #   matched.

  def search_full(pattern, advance_pointer, return_string)
    do_scan pattern, advance_pointer, return_string, false
  end

  # Attempts to skip over the given `pattern` beginning with the scan pointer.
  # If it matches, the scan pointer is advanced to the end of the match, and the
  # length of the match is returned. Otherwise, `nil` is returned.
  #
  # It's similar to {#scan}, but without returning the matched string.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [Integer, nil] The number of characters advanced, if matched.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.skip(/\w+/) # -> 4
  #   p s.skip(/\w+/) # -> nil
  #   p s.skip(/\s+/) # -> 1
  #   p s.skip(/\w+/) # -> 6
  #   p s.skip(/./) # -> nil

  def skip(pattern)
    do_scan pattern, true, false, true
  end

  # Look ahead to match `pattern`, and advance the scan pointer to the _end_ of
  # the match. Return the number of characters advanced, or `nil` if the match
  # was unsuccessful.
  #
  # It's similar to {#scan_until}, but without returning the intervening string.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [Integer, nil] The number of characters advanced, if matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.skip_until /12/ # -> 10

  def skip_until(pattern)
    do_scan pattern, true, false, false
  end

  # @return [String] The string being scanned.

  attr_reader :string

  # Changes the string being scanned to `str` and resets the scanner.
  #
  # @param [String] str The new string to scan.
  # @return [String] `str`

  def string=(str)
    @string   = str
    # Drop every trace of the old string so no stale match data survives.
    @matches  = nil
    @matched  = false
    @current  = 0
    @previous = 0
    str
  end

  # Set the scan pointer to the end of the string and clear matching data.
  #
  # @return [UnicodeScanner] This scanner.

  def terminate
    @current = @string.length
    @matched = false
    self
  end
  alias clear terminate

  # Set the scan pointer to the previous position. Only one previous position is
  # remembered, and it changes with each scanning operation.
  #
  # @return [UnicodeScanner] This scanner.
  # @raise [ScanError] If the most recent scanning operation did not match.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.scan(/\w+/) # => "test"
  #   s.unscan
  #   s.scan(/../) # => "te"
  #   s.scan(/\d/) # => nil
  #   s.unscan # ScanError: unscan failed: previous match record not exist

  def unscan
    raise ScanError, "unscan failed: previous match record not exist" unless @matched
    @current = @previous
    @matched = false
    self
  end

  private

  # Shared implementation behind every scanning method.
  #
  # The pattern is matched against the substring that starts at the scan
  # pointer, so all offsets stored in `@matches` are character offsets relative
  # to the scan pointer at the time of the call (remembered in `@previous`).
  #
  # Nothing is matched once the scan pointer has reached the end of the string;
  # in particular a pattern that can match the empty string yields `nil` there.
  #
  # @param [Regexp] regex The pattern to match.
  # @param [true, false] advance_pointer Whether to move the scan pointer to the
  #   end of the match.
  # @param [true, false] return_string Whether to return the text from the scan
  #   pointer through the end of the match (`true`) or its length (`false`).
  # @param [true, false] head_only Whether the match must begin exactly at the
  #   scan pointer.
  # @return [String, Integer, nil] See `return_string`; `nil` if nothing matched.
  # @raise [ArgumentError] If `regex` is not a Regexp.

  def do_scan(regex, advance_pointer, return_string, head_only)
    unless regex.kind_of?(Regexp)
      raise ArgumentError, "wrong argument type #{regex.class} (expected Regexp)"
    end

    @matched = false
    return nil if eos?

    @matches = regex.match(@string[@current, @string.length])
    return nil unless @matches

    # The leftmost match is always found first, so a match that begins after
    # offset 0 means the pattern does not match at the scan pointer itself.
    if head_only && @matches.begin(0) > 0
      @matches = nil
      return nil
    end

    @matched  = true
    @previous = @current
    consumed  = @matches.end(0)
    @current += consumed if advance_pointer

    return_string ? @string[@previous, consumed] : consumed
  end

  # @return [String] Up to INSPECT_LENGTH characters immediately before the
  #   scan pointer, prefixed with "..." when earlier text was cut off.

  def inspect_before
    return '' if @current == 0

    if @current > INSPECT_LENGTH
      '...' + @string[@current - INSPECT_LENGTH, INSPECT_LENGTH]
    else
      @string[0, @current]
    end
  end

  # @return [String] Up to INSPECT_LENGTH characters starting at the scan
  #   pointer, suffixed with "..." when later text was cut off.

  def inspect_after
    return '' if eos?

    remaining = @string.length - @current
    if remaining > INSPECT_LENGTH
      @string[@current, INSPECT_LENGTH] + '...'
    else
      @string[@current, remaining]
    end
  end
end

# Raised by UnicodeScanner#unscan when there is no previous match to undo.
class ScanError < StandardError; end
@@ -0,0 +1,12 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'rspec'
4
+ require 'unicode_scanner'
5
+
6
+ # Requires supporting files with custom matchers and macros, etc,
7
+ # in ./support/ and its subdirectories.
8
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
9
+
10
+ RSpec.configure do |config|
11
+
12
+ end
@@ -0,0 +1,206 @@
1
+ # encoding: utf-8
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
4
+
5
+ describe UnicodeScanner do
6
+ it "should pass all the class-level examples" do
7
+ s = UnicodeScanner.new('This is an example string')
8
+ s.eos?.should == false
9
+
10
+ s.scan(/\w+/).should == "This"
11
+ s.scan(/\w+/).should == nil
12
+ s.scan(/\s+/).should == " "
13
+ s.scan(/\s+/).should == nil
14
+ s.scan(/\w+/).should == "is"
15
+ s.eos?.should == false
16
+
17
+ s.scan(/\s+/).should == " "
18
+ s.scan(/\w+/).should == "an"
19
+ s.scan(/\s+/).should == " "
20
+ s.scan(/\w+/).should == "example"
21
+ s.scan(/\s+/).should == " "
22
+ s.scan(/\w+/).should == "string"
23
+ s.eos?.should == true
24
+
25
+ s.scan(/\s+/).should == nil
26
+ s.scan(/\w+/).should == nil
27
+ end
28
+
29
+ it "should pass the #concat example" do
30
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
31
+ s.scan(/Fri /)
32
+ s << " +1000 GMT"
33
+ s.string.should == "Fri Dec 12 1975 14:39 +1000 GMT"
34
+ s.scan(/Dec/).should == "Dec"
35
+ end
36
+
37
+ it "should pass the #[] example" do
38
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
39
+ s.scan(/(\w+) (\w+) (\d+) /).should == "Fri Dec 12 "
40
+ s[0].should == "Fri Dec 12 "
41
+ s[1].should == "Fri"
42
+ s[2].should == "Dec"
43
+ s[3].should == "12"
44
+ s.post_match.should == "1975 14:39"
45
+ s.pre_match.should == ""
46
+ end
47
+
48
+ it "should pass the #beginning_of_line? example" do
49
+ s = UnicodeScanner.new("test\ntest\n")
50
+ s.bol?.should == true
51
+ s.scan(/te/)
52
+ s.bol?.should == false
53
+ s.scan(/st\n/)
54
+ s.bol?.should == true
55
+ s.terminate
56
+ s.bol?.should == true
57
+ end
58
+
59
+ it "should pass the #check example" do
60
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
61
+ s.check(/Fri/).should == "Fri"
62
+ s.pos.should == 0
63
+ s.matched.should == "Fri"
64
+ s.check(/12/).should == nil
65
+ s.matched.should == nil
66
+ end
67
+
68
+ it "should pass the #check_until example" do
69
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
70
+ s.check_until(/12/).should == "Fri Dec 12"
71
+ s.pos.should == 0
72
+ s.matched.should == "12"
73
+ end
74
+
75
+ it "should pass the #eos? example" do
76
+ s = UnicodeScanner.new('test string')
77
+ s.eos?.should == false
78
+ s.scan(/test/)
79
+ s.eos?.should == false
80
+ s.terminate
81
+ s.eos?.should == true
82
+ end
83
+
84
+ it "should pass the #exist? example" do
85
+ s = UnicodeScanner.new('test string')
86
+ s.exist?(/s/).should == 3
87
+ s.scan(/test/).should == "test"
88
+ s.exist?(/s/).should == 2
89
+ s.exist?(/e/).should == nil
90
+ end
91
+
92
+ it "should pass a tweaked version of the #getch example" do
93
+ s = UnicodeScanner.new("ab")
94
+ s.getch.should == "a"
95
+ s.getch.should == "b"
96
+ s.getch.should == nil
97
+
98
+ s = UnicodeScanner.new("ぁ")
99
+ s.getch.should == "ぁ" # Japanese hira-kana "A" in EUC-JP
100
+ s.getch.should == nil
101
+ end
102
+
103
+ it "should pass the #inspect example" do
104
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
105
+ s.inspect.should == '#<UnicodeScanner 0/21 @ "Fri D...">'
106
+ s.scan_until(/12/).should == "Fri Dec 12"
107
+ s.inspect.should == '#<UnicodeScanner 10/21 "...ec 12" @ " 1975...">'
108
+ end
109
+
110
+ it "should pass the #match? example" do
111
+ s = UnicodeScanner.new('test string')
112
+ s.match?(/\w+/).should == 4
113
+ s.match?(/\w+/).should == 4
114
+ s.match?(/\s+/).should == nil
115
+ end
116
+
117
+ it "should pass the #matched example" do
118
+ s = UnicodeScanner.new('test string')
119
+ s.match?(/\w+/).should == 4
120
+ s.matched.should == "test"
121
+ end
122
+
123
+ it "should pass the #matched? example" do
124
+ s = UnicodeScanner.new('test string')
125
+ s.match?(/\w+/).should == 4
126
+ s.matched?.should == true
127
+ s.match?(/\d+/).should == nil
128
+ s.matched?.should == false
129
+ end
130
+
131
+ it "should pass the #matched_size example" do
132
+ s = UnicodeScanner.new('test string')
133
+ s.check(/\w+/).should == "test"
134
+ s.matched_size.should == 4
135
+ s.check(/\d+/).should == nil
136
+ s.matched_size.should == nil
137
+ end
138
+
139
+ it "should pass the #peek example" do
140
+ s = UnicodeScanner.new('test string')
141
+ s.peek(7).should == "test st"
142
+ s.peek(7).should == "test st"
143
+ end
144
+
145
+ it "should pass the #pos example" do
146
+ s = UnicodeScanner.new('test string')
147
+ s.pos.should == 0
148
+ s.scan_until(/str/).should == "test str"
149
+ s.pos.should == 8
150
+ s.terminate.inspect.should == "#<UnicodeScanner fin>"
151
+ s.pos.should == 11
152
+ end
153
+
154
+ it "should pass the #pos= example" do
155
+ s = UnicodeScanner.new('test string')
156
+ (s.pos = 7).should == 7
157
+ s.rest.should == "ring"
158
+ end
159
+
160
+ it "should pass the #post_match/#pre_match example" do
161
+ s = UnicodeScanner.new('test string')
162
+ s.scan(/\w+/).should == "test"
163
+ s.scan(/\s+/).should == " "
164
+ s.pre_match.should == "test"
165
+ s.post_match.should == "string"
166
+ end
167
+
168
+ it "should pass the #scan example" do
169
+ s = UnicodeScanner.new('test string')
170
+ s.scan(/\w+/).should == "test"
171
+ s.scan(/\w+/).should == nil
172
+ s.scan(/\s+/).should == " "
173
+ s.scan(/\w+/).should == "string"
174
+ s.scan(/./).should == nil
175
+ end
176
+
177
+ it "should pass the #scan_until example" do
178
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
179
+ s.scan_until(/1/).should == "Fri Dec 1"
180
+ s.pre_match.should == "Fri Dec "
181
+ s.scan_until(/XYZ/).should == nil
182
+ end
183
+
184
+ it "should pass the #skip example" do
185
+ s = UnicodeScanner.new('test string')
186
+ s.skip(/\w+/).should == 4
187
+ s.skip(/\w+/).should == nil
188
+ s.skip(/\s+/).should == 1
189
+ s.skip(/\w+/).should == 6
190
+ s.skip(/./).should == nil
191
+ end
192
+
193
+ it "should pass the half-finished #skip_until example" do
194
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
195
+ s.skip_until(/12/).should == 10
196
+ end
197
+
198
+ it "should pass the #unscan example" do
199
+ s = UnicodeScanner.new('test string')
200
+ s.scan(/\w+/).should == "test"
201
+ s.unscan
202
+ s.scan(/../).should == "te"
203
+ s.scan(/\d/).should == nil
204
+ -> { s.unscan }.should raise_error(ScanError, 'unscan failed: previous match record not exist')
205
+ end
206
+ end
@@ -0,0 +1,64 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "unicode_scanner"
8
+ s.version = "1.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Tim Morgan"]
12
+ s.date = "2012-07-12"
13
+ s.description = "An implementation of StringScanner that doesn't split multibyte characters."
14
+ s.email = "git@timothymorgan.info"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ ".rvmrc",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.md",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "lib/unicode_scanner.rb",
30
+ "spec/spec_helper.rb",
31
+ "spec/unicode_scanner_spec.rb",
32
+ "unicode_scanner.gemspec"
33
+ ]
34
+ s.homepage = "http://github.com/RISCfuture/unicode_scanner"
35
+ s.licenses = ["MIT"]
36
+ s.require_paths = ["lib"]
37
+ s.rubygems_version = "1.8.24"
38
+ s.summary = "Unicode-aware implementation of StringScanner"
39
+
40
+ if s.respond_to? :specification_version then
41
+ s.specification_version = 3
42
+
43
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
44
+ s.add_development_dependency(%q<rspec>, [">= 0"])
45
+ s.add_development_dependency(%q<redcarpet>, [">= 0"])
46
+ s.add_development_dependency(%q<yard>, [">= 0"])
47
+ s.add_development_dependency(%q<bundler>, [">= 0"])
48
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
49
+ else
50
+ s.add_dependency(%q<rspec>, [">= 0"])
51
+ s.add_dependency(%q<redcarpet>, [">= 0"])
52
+ s.add_dependency(%q<yard>, [">= 0"])
53
+ s.add_dependency(%q<bundler>, [">= 0"])
54
+ s.add_dependency(%q<jeweler>, [">= 0"])
55
+ end
56
+ else
57
+ s.add_dependency(%q<rspec>, [">= 0"])
58
+ s.add_dependency(%q<redcarpet>, [">= 0"])
59
+ s.add_dependency(%q<yard>, [">= 0"])
60
+ s.add_dependency(%q<bundler>, [">= 0"])
61
+ s.add_dependency(%q<jeweler>, [">= 0"])
62
+ end
63
+ end
64
+
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode_scanner
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tim Morgan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-12 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: redcarpet
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: yard
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: jeweler
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: An implementation of StringScanner that doesn't split multibyte characters.
95
+ email: git@timothymorgan.info
96
+ executables: []
97
+ extensions: []
98
+ extra_rdoc_files:
99
+ - LICENSE.txt
100
+ - README.md
101
+ files:
102
+ - .document
103
+ - .rspec
104
+ - .rvmrc
105
+ - Gemfile
106
+ - Gemfile.lock
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - VERSION
111
+ - lib/unicode_scanner.rb
112
+ - spec/spec_helper.rb
113
+ - spec/unicode_scanner_spec.rb
114
+ - unicode_scanner.gemspec
115
+ homepage: http://github.com/RISCfuture/unicode_scanner
116
+ licenses:
117
+ - MIT
118
+ post_install_message:
119
+ rdoc_options: []
120
+ require_paths:
121
+ - lib
122
+ required_ruby_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ segments:
129
+ - 0
130
+ hash: -3935821298050612576
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project:
139
+ rubygems_version: 1.8.24
140
+ signing_key:
141
+ specification_version: 3
142
+ summary: Unicode-aware implementation of StringScanner
143
+ test_files: []