unicode_scanner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm 1.9.3@scanner --create
data/Gemfile ADDED
@@ -0,0 +1,11 @@
# Fetch gems from the official RubyGems index over HTTPS. The symbol shorthand
# `source :rubygems` is deprecated by Bundler and resolves to the plain-HTTP
# index, so the URL is spelled out explicitly.
source 'https://rubygems.org'

# Everything here is needed only to work on the gem itself; the library has no
# runtime dependencies.
group :development do
  gem 'rspec'

  gem 'redcarpet'
  gem 'yard'

  gem 'bundler'
  gem 'jeweler'
end
data/Gemfile.lock ADDED
@@ -0,0 +1,34 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.1.3)
5
+ git (1.2.5)
6
+ jeweler (1.8.4)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rdoc
11
+ json (1.7.3)
12
+ rake (0.9.2.2)
13
+ rdoc (3.12)
14
+ json (~> 1.4)
15
+ redcarpet (2.1.1)
16
+ rspec (2.11.0)
17
+ rspec-core (~> 2.11.0)
18
+ rspec-expectations (~> 2.11.0)
19
+ rspec-mocks (~> 2.11.0)
20
+ rspec-core (2.11.0)
21
+ rspec-expectations (2.11.1)
22
+ diff-lcs (~> 1.1.3)
23
+ rspec-mocks (2.11.1)
24
+ yard (0.8.2.1)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bundler
31
+ jeweler
32
+ redcarpet
33
+ rspec
34
+ yard
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Tim Morgan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ Unicode String Scanner
2
+ ======================
3
+
4
+ A Unicode-aware implementation of Ruby's `StringScanner`.
5
+
6
+ | | |
7
+ |:------------|:--------------------------------|
8
+ | **Author** | Tim Morgan |
9
+ | **Version** | 1.0 (Jul 11, 2012) |
10
+ | **License** | Released under the MIT license. |
11
+
12
+ About
13
+ -----
14
+
15
+ Did you know that `StringScanner` splits codepoints? Neither did I. This one
16
+ doesn't.
17
+
18
+ **When would I want to use this?** When you want to use `StringScanner` on a
19
+ Unicode (UTF-_n_) string.
20
+
21
+ **When would I _not_ want to use this?** If you're interested in speed. This is
22
+ slower than StringScanner because a) it's not written in native C, and b) it's
23
+ slower to traverse Unicode strings anyway because characters can have varying
24
+ byte sizes.
25
+
26
+ Installation
27
+ ------------
28
+
29
+ Simply add this gem to your project's `Gemfile`:
30
+
31
+ ```` ruby
32
+ gem 'unicode_scanner'
33
+ ````
34
+
35
+ Usage
36
+ -----
37
+
38
+ The `UnicodeScanner` object responds to exactly the same API as
39
+ [StringScanner](http://ruby-doc.org/stdlib-1.9.3/libdoc/strscan/rdoc/StringScanner.html),
40
+ with the exception of the following methods:
41
+
42
+ * `getbyte`
43
+ * any obsolete methods
44
+
45
+ For more information, see the {UnicodeScanner} class documentation.
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "unicode_scanner"
18
+ gem.homepage = "http://github.com/RISCfuture/unicode_scanner"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{Unicode-aware implementation of StringScanner}
21
+ gem.description = %Q{An implementation of StringScanner that doesn't split multibyte characters.}
22
+ gem.email = "git@timothymorgan.info"
23
+ gem.authors = ["Tim Morgan"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ require 'rspec/core'
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec) do |spec|
31
+ spec.pattern = FileList['spec/**/*_spec.rb']
32
+ end
33
+
34
+ task default: :spec
35
+
36
+ require 'yard'
37
+
38
+ # bring sexy back (sexy == tables)
39
+ module YARD::Templates::Helpers::HtmlHelper
40
+ def html_markup_markdown(text)
41
+ markup_class(:markdown).new(text, :gh_blockcode, :fenced_code, :autolink, :tables).to_html
42
+ end
43
+ end
44
+
45
+ YARD::Rake::YardocTask.new('doc') do |doc|
46
+ doc.options << '-m' << 'markdown' << '-M' << 'redcarpet'
47
+ doc.options << '--protected' << '--no-private'
48
+ doc.options << '-r' << 'README.md'
49
+ doc.options << '-o' << 'doc'
50
+ doc.options << '--title' << 'Unicode String Scanner Documentation'
51
+
52
+ doc.files = %w( lib/**/* README.md )
53
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,655 @@
# UnicodeScanner provides for Unicode-aware lexical scanning operations on a
# `String`. Here is an example of its usage:
#
# ```` ruby
# s = UnicodeScanner.new('This is an example string')
# s.eos?               # -> false
#
# p s.scan(/\w+/)      # -> "This"
# p s.scan(/\w+/)      # -> nil
# p s.scan(/\s+/)      # -> " "
# p s.scan(/\s+/)      # -> nil
# p s.scan(/\w+/)      # -> "is"
# s.eos?               # -> false
#
# p s.scan(/\s+/)      # -> " "
# p s.scan(/\w+/)      # -> "an"
# p s.scan(/\s+/)      # -> " "
# p s.scan(/\w+/)      # -> "example"
# p s.scan(/\s+/)      # -> " "
# p s.scan(/\w+/)      # -> "string"
# s.eos?               # -> true
#
# p s.scan(/\s+/)      # -> nil
# p s.scan(/\w+/)      # -> nil
# ````
#
# Scanning a string means remembering the position of a _scan pointer_, which is
# just an index. The point of scanning is to move forward a bit at a time, so
# matches are sought after the scan pointer; usually immediately after it.
#
# Given the string "test string", here are the pertinent scan pointer positions:
#
# ````
#   t e s t   s t r i n g
# 0 1 2 ...             1
#                       0
# ````
#
# When you {#scan} for a pattern (a regular expression), the match must occur at
# the character after the scan pointer. If you use {#scan_until}, then the
# match can occur anywhere after the scan pointer. In both cases, the scan
# pointer moves _just beyond_ the last character of the match, ready to scan
# again from the next character onwards. This is demonstrated by the example
# above.
#
# Method Categories
# -----------------
#
# There are other methods besides the plain scanners. You can look ahead in the
# string without actually scanning. You can access the most recent match. You
# can modify the string being scanned, reset or terminate the scanner, find out
# or change the position of the scan pointer, skip ahead, and so on.
#
# ### Advancing the Scan Pointer
#
# - {#getch}
# - {#scan}
# - {#scan_until}
# - {#skip}
# - {#skip_until}
#
# ### Looking Ahead
#
# - {#check}
# - {#check_until}
# - {#exist?}
# - {#match?}
# - {#peek}
#
# ### Finding Where we Are
#
# - {#beginning_of_line?} ({#bol?})
# - {#eos?}
# - {#rest_size}
# - {#pos}
#
# ### Setting Where we Are
#
# - {#reset}
# - {#terminate}
# - {#pos=}
#
# ### Match Data
#
# - {#matched}
# - {#matched?}
# - {#matched_size}
# - {#[]}
# - {#pre_match}
# - {#post_match}
#
# ### Miscellaneous
#
# - {#<<}
# - {#concat}
# - {#string}
# - {#string=}
# - {#unscan}
#
# There are aliases to several of the methods.

class UnicodeScanner
  # Maximum number of characters shown on each side of the scan pointer by
  # {#inspect}.
  INSPECT_LENGTH = 5

  # Creates a new UnicodeScanner object to scan over the given `string`.
  #
  # @param [String] string The string to iterate over.

  def initialize(string)
    @string   = string
    @matches  = nil   # MatchData of the most recent attempt (relative to the scan pointer)
    @matched  = false # whether the most recent attempt succeeded
    @current  = 0     # scan pointer, as a character index into @string
    @previous = 0     # scan pointer as it was before the most recent successful match
  end

  # Appends `str` to the string being scanned. This method does not affect the
  # scan pointer.
  #
  # @param [String] str The string to append.
  # @return [String] The string being scanned, after appending.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.scan(/Fri /)
  #   s << " +1000 GMT"
  #   s.string            # -> "Fri Dec 12 1975 14:39 +1000 GMT"
  #   s.scan(/Dec/)       # -> "Dec"

  def concat(str)
    @string.concat str
  end

  alias << concat

  # Return the <i>n</i>th subgroup in the most recent match.
  #
  # @param [Integer] n The index of the subgroup to return.
  # @return [String, nil] The subgroup, if it exists.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.scan(/(\w+) (\w+) (\d+) /)       # -> "Fri Dec 12 "
  #   s[0]                               # -> "Fri Dec 12 "
  #   s[1]                               # -> "Fri"
  #   s[2]                               # -> "Dec"
  #   s[3]                               # -> "12"
  #   s.post_match                       # -> "1975 14:39"
  #   s.pre_match                        # -> ""

  def [](n)
    @matched ? @matches[n] : nil
  end

  # @return [true, false, nil] `true` iff the scan pointer is at the beginning
  #   of the line; `nil` if the scan pointer lies beyond the end of the string.
  #
  # @example
  #   s = UnicodeScanner.new("test\ntest\n")
  #   s.bol?           # => true
  #   s.scan(/te/)
  #   s.bol?           # => false
  #   s.scan(/st\n/)
  #   s.bol?           # => true
  #   s.terminate
  #   s.bol?           # => true

  def beginning_of_line?
    return nil if @current > @string.length
    return true if @current == 0
    @string[@current - 1] == "\n"
  end

  alias bol? beginning_of_line?

  # This returns the value that {#scan} would return, without advancing the scan
  # pointer. The match register is affected, though.
  #
  # Mnemonic: it "checks" to see whether a {#scan} will return a value.
  #
  # @param [Regexp] pattern The pattern to scan for.
  # @return [String, nil] The matched segment, if matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.check /Fri/               # -> "Fri"
  #   s.pos                       # -> 0
  #   s.matched                   # -> "Fri"
  #   s.check /12/                # -> nil
  #   s.matched                   # -> nil

  def check(pattern)
    do_scan pattern, false, true, true
  end

  # This returns the value that {#scan_until} would return, without advancing
  # the scan pointer. The match register is affected, though.
  #
  # Mnemonic: it "checks" to see whether a {#scan_until} will return a value.
  #
  # @param [Regexp] pattern The pattern to scan until reaching.
  # @return [String, nil] The matched segment, if matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.check_until /12/          # -> "Fri Dec 12"
  #   s.pos                       # -> 0
  #   s.matched                   # -> "12"

  def check_until(pattern)
    do_scan pattern, false, true, false
  end

  # @return [true, false] `true` if the scan pointer is at the end of the string.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.eos?          # => false
  #   s.scan(/test/)
  #   p s.eos?          # => false
  #   s.terminate
  #   p s.eos?          # => true

  def eos?
    @current >= @string.length
  end

  # Looks _ahead_ to see if the `pattern` exists _anywhere_ in the string,
  # without advancing the scan pointer. This predicates whether a {#scan_until}
  # will return a value.
  #
  # @param [Regexp] pattern The pattern to search for.
  # @return [Integer, nil] The number of characters from the scan pointer to
  #   the end of the match, or `nil` if the pattern does not exist ahead.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.exist? /s/            # -> 3
  #   s.scan /test/           # -> "test"
  #   s.exist? /s/            # -> 2
  #   s.exist? /e/            # -> nil

  def exist?(pattern)
    do_scan pattern, false, false, false
  end

  # Scans one character and returns it. Any character is consumed, including a
  # newline.
  #
  # @return [String, nil] The character, or `nil` at the end of the string.
  #
  # @example
  #   s = UnicodeScanner.new("ab")
  #   s.getch           # => "a"
  #   s.getch           # => "b"
  #   s.getch           # => nil
  #
  #   s = UnicodeScanner.new("\u3042")
  #   s.getch           # => "\u3042" # HIRAGANA LETTER A: one character, three UTF-8 bytes
  #   s.getch           # => nil

  def getch
    return nil if eos?
    # /m makes `.` match "\n" as well; without it the scanner would return nil
    # and stop advancing at every line break.
    do_scan(/./mu, true, true, true)
  end

  # Returns a string that represents the UnicodeScanner object, showing:
  #
  # * the current position
  # * the size of the string
  # * the characters surrounding the scan pointer
  #
  # @return [String] A description of this object.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.inspect            # -> '#<UnicodeScanner 0/21 @ "Fri D...">'
  #   s.scan_until /12/    # -> "Fri Dec 12"
  #   s.inspect            # -> '#<UnicodeScanner 10/21 "...ec 12" @ " 1975...">'

  def inspect
    return "#<#{self.class.to_s} (uninitialized)>" if @string.nil?
    return "#<#{self.class.to_s} fin>" if eos?

    # Nothing precedes the pointer at position 0, so only the text after it is
    # shown.
    if @current == 0
      return "#<%s %d/%d @ %s>" % [self.class.to_s, @current, @string.length, inspect_after.inspect]
    end

    "#<%s %d/%d %s @ %s>" % [self.class.to_s, @current, @string.length, inspect_before.inspect, inspect_after.inspect]
  end

  # Tests whether the given `pattern` is matched from the current scan pointer.
  # Returns the length of the match, or `nil`. The scan pointer is not advanced.
  #
  # @param [Regexp] pattern The pattern to match with.
  # @return [Integer, nil] The length of the match in characters, or `nil` if
  #   the pattern is not matched from the scan pointer.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.match?(/\w+/)   # -> 4
  #   p s.match?(/\w+/)   # -> 4
  #   p s.match?(/\s+/)   # -> nil

  def match?(pattern)
    do_scan pattern, false, false, true
  end

  # @return [String, nil] The last matched string.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.match?(/\w+/)     # -> 4
  #   s.matched           # -> "test"

  def matched
    return nil unless @matched
    @matches[0]
  end

  # @return [true, false] `true` iff the last match was successful.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.match?(/\w+/)     # => 4
  #   s.matched?          # => true
  #   s.match?(/\d+/)     # => nil
  #   s.matched?          # => false

  def matched?
    @matched
  end

  # @return [Integer, nil] The size of the most recent match (see {#matched}), or
  #   `nil` if there was no recent match.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.check /\w+/           # -> "test"
  #   s.matched_size          # -> 4
  #   s.check /\d+/           # -> nil
  #   s.matched_size          # -> nil

  def matched_size
    return nil unless @matched
    @matches.end(0) - @matches.begin(0)
  end

  # Extracts a string corresponding to `string[pos,len]`, without advancing the
  # scan pointer.
  #
  # @param [Integer] len The number of characters ahead to peek.
  # @return [String] The string after the current position.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.peek(7)          # => "test st"
  #   s.peek(7)          # => "test st"

  def peek(len)
    return '' if eos?
    @string[@current, len]
  end

  # Returns the character position of the scan pointer. In the 'reset'
  # position, this value is zero. In the 'terminated' position (i.e. the string
  # is exhausted), this value is the length of the string in characters.
  #
  # In short, it's a 0-based character index into the string.
  #
  # @return [Integer] The current scan position.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.pos               # -> 0
  #   s.scan_until /str/  # -> "test str"
  #   s.pos               # -> 8
  #   s.terminate         # -> #<UnicodeScanner fin>
  #   s.pos               # -> 11

  def pos
    @current
  end

  alias pointer pos

  # Set the character position of the scan pointer. A negative value counts
  # back from the end of the string.
  #
  # @param [Integer] n The new position.
  # @raise [RangeError] If the position lies outside the string.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.pos = 7            # -> 7
  #   s.rest               # -> "ring"

  def pos=(n)
    n += @string.length if n < 0
    raise RangeError, "index out of range" if n < 0
    raise RangeError, "index out of range" if n > @string.length
    @current = n
  end

  # @return [String, nil] The _**post**-match_ (in the regular expression
  #   sense) of the last scan, or `nil` if the last match failed.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.scan(/\w+/)           # -> "test"
  #   s.scan(/\s+/)           # -> " "
  #   s.pre_match             # -> "test"
  #   s.post_match            # -> "string"

  def post_match
    return nil unless @matched
    # Match offsets are relative to where the scan started, hence + @previous.
    @string[@previous + @matches.end(0), @string.length]
  end

  # @return [String, nil] The _**pre**-match_ (in the regular expression sense)
  #   of the last scan, or `nil` if the last match failed.
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.scan(/\w+/)           # -> "test"
  #   s.scan(/\s+/)           # -> " "
  #   s.pre_match             # -> "test"
  #   s.post_match            # -> "string"

  def pre_match
    return nil unless @matched
    # Match offsets are relative to where the scan started, hence + @previous.
    @string[0, @previous + @matches.begin(0)]
  end

  # Reset the scan pointer (index 0) and clear matching data.

  def reset
    @current = 0
    @matched = false
  end

  # @return [String] The "rest" of the string (i.e. everything after the scan
  #   pointer). If there is no more data (`eos? = true`), it returns `""`.

  def rest
    return '' if eos?
    @string[@current, @string.length]
  end

  # @return [Integer] The value returned by `s.rest.size`.

  def rest_size
    return 0 if eos?
    @string.length - @current
  end

  # Tries to match with `pattern` at the current position. If there's a match,
  # the scanner advances the "scan pointer" and returns the matched string.
  # Otherwise, the scanner returns `nil`.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [String, nil] The string that was matched, if a match was found.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.scan(/\w+/)   # -> "test"
  #   p s.scan(/\w+/)   # -> nil
  #   p s.scan(/\s+/)   # -> " "
  #   p s.scan(/\w+/)   # -> "string"
  #   p s.scan(/./)     # -> nil

  def scan(pattern)
    do_scan pattern, true, true, true
  end

  # Tests whether the given `pattern` is matched from the current scan pointer.
  # Advances the scan pointer if `advance_pointer` is `true`. Returns the
  # matched string if `return_string` is true. The match register is affected.
  #
  # "full" means "scan with full parameters".
  #
  # @param [Regexp] pattern The pattern to scan.
  # @param [true, false] advance_pointer Whether to advance the scan pointer if
  #   a match is found.
  # @param [true, false] return_string Whether to return the matched segment.
  # @return [String, Integer, nil] The matched segment if `return_string` is
  #   `true`, otherwise the number of characters advanced. `nil` if nothing
  #   matched.

  def scan_full(pattern, advance_pointer, return_string)
    do_scan pattern, advance_pointer, return_string, true
  end

  # Scans the string _until_ the `pattern` is matched. Returns the substring up
  # to and including the end of the match, advancing the scan pointer to that
  # location. If there is no match, `nil` is returned.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [String, nil] The segment that matched.
  #
  # @example
  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
  #   s.scan_until(/1/)        # -> "Fri Dec 1"
  #   s.pre_match              # -> "Fri Dec "
  #   s.scan_until(/XYZ/)      # -> nil

  def scan_until(pattern)
    do_scan pattern, true, true, false
  end

  # Scans the string `until` the pattern is matched. Advances the scan pointer
  # if `advance_pointer`, otherwise not. Returns the matched string if
  # `return_string` is `true`, otherwise returns the number of characters
  # advanced. This method does affect the match register.
  #
  # @param [Regexp] pattern The pattern to scan.
  # @param [true, false] advance_pointer Whether to advance the scan pointer if
  #   a match is found.
  # @param [true, false] return_string Whether to return the matched segment.
  # @return [String, Integer, nil] The matched segment if `return_string` is
  #   `true`, otherwise the number of characters advanced. `nil` if nothing
  #   matched.

  def search_full(pattern, advance_pointer, return_string)
    do_scan pattern, advance_pointer, return_string, false
  end

  # Attempts to skip over the given `pattern` beginning with the scan pointer.
  # If it matches, the scan pointer is advanced to the end of the match, and the
  # length of the match is returned. Otherwise, `nil` is returned.
  #
  # It's similar to {#scan}, but without returning the matched string.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [Integer, nil] The number of characters advanced, if matched.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   p s.skip(/\w+/)   # -> 4
  #   p s.skip(/\w+/)   # -> nil
  #   p s.skip(/\s+/)   # -> 1
  #   p s.skip(/\w+/)   # -> 6
  #   p s.skip(/./)     # -> nil

  def skip(pattern)
    do_scan pattern, true, false, true
  end

  # Advances the scan pointer until `pattern` is matched and consumed. Returns
  # the number of characters advanced, or `nil` if no match was found.
  #
  # Look ahead to match `pattern`, and advance the scan pointer to the _end_ of
  # the match. Return the number of characters advanced, or `nil` if the match
  # was unsuccessful.
  #
  # It's similar to {#scan_until}, but without returning the intervening string.
  #
  # @param [Regexp] pattern The pattern to match.
  # @return [Integer, nil] The number of characters advanced, if matched.

  def skip_until(pattern)
    do_scan pattern, true, false, false
  end

  # @return [String] The string being scanned.

  def string
    @string
  end

  # Changes the string being scanned to `str` and resets the scanner.
  #
  # @param [String] str The new string to scan.
  # @return [String] `str`

  def string=(str)
    @string  = str
    @matched = false
    @current = 0
    str
  end

  # Set the scan pointer to the end of the string and clear matching data.
  #
  # @return [UnicodeScanner] This scanner.

  def terminate
    @current = @string.length
    @matched = false
    self
  end
  alias clear terminate

  # Set the scan pointer to the previous position. Only one previous position is
  # remembered, and it changes with each scanning operation.
  #
  # @return [UnicodeScanner] This scanner.
  # @raise [ScanError] If the most recent match attempt failed or was cleared.
  #
  # @example
  #   s = UnicodeScanner.new('test string')
  #   s.scan(/\w+/)        # => "test"
  #   s.unscan
  #   s.scan(/../)         # => "te"
  #   s.scan(/\d/)         # => nil
  #   s.unscan             # ScanError: unscan failed: previous match record not exist

  def unscan
    raise ScanError, "unscan failed: previous match record not exist" unless @matched
    @current = @previous
    @matched = false
    self
  end

  private

  # Shared implementation behind every scanning and look-ahead method.
  #
  # The pattern is matched against the substring that starts at the scan
  # pointer, so all offsets in the stored MatchData are relative to the scan
  # pointer rather than to the start of the whole string.
  #
  # @param [Regexp] regex The pattern to match.
  # @param [true, false] advance_pointer Whether to move the scan pointer to the
  #   end of the match.
  # @param [true, false] return_string Whether to return the scanned text
  #   (`true`) or the number of characters covered (`false`).
  # @param [true, false] head_only Whether the match must begin exactly at the
  #   scan pointer.
  # @return [String, Integer, nil] The text from the scan pointer to the end of
  #   the match, or its length; `nil` if nothing matched or the scanner is at
  #   the end of the string.
  # @raise [ArgumentError] If `regex` is not a Regexp.

  def do_scan(regex, advance_pointer, return_string, head_only)
    unless regex.kind_of?(Regexp)
      raise ArgumentError, "pattern must be a Regexp (got #{regex.class})"
    end

    @matched = false
    return nil if eos?

    @matches = regex.match(@string[@current, @string.length])
    return nil unless @matches

    # Anchored scans reject a match that starts anywhere past the pointer.
    if head_only && @matches.begin(0) > 0
      @matches = nil
      return nil
    end

    @matched = true

    @previous = @current
    consumed  = @matches.end(0)
    @current += consumed if advance_pointer

    return_string ? @string[@previous, consumed] : consumed
  end

  # @return [String] Up to {INSPECT_LENGTH} characters preceding the scan
  #   pointer, prefixed with "..." when earlier text has been cut off.

  def inspect_before
    return '' if @current == 0
    return @string[0, @current] if @current <= INSPECT_LENGTH

    '...' + @string[@current - INSPECT_LENGTH, INSPECT_LENGTH]
  end

  # @return [String] Up to {INSPECT_LENGTH} characters following the scan
  #   pointer, suffixed with "..." when later text has been cut off.

  def inspect_after
    return '' if eos?

    remaining = @string.length - @current
    return @string[@current, remaining] if remaining <= INSPECT_LENGTH

    @string[@current, INSPECT_LENGTH] + '...'
  end
end

# Raised by {UnicodeScanner#unscan} when there is no successful match to undo.
class ScanError < StandardError; end
@@ -0,0 +1,12 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'rspec'
4
+ require 'unicode_scanner'
5
+
6
+ # Requires supporting files with custom matchers and macros, etc,
7
+ # in ./support/ and its subdirectories.
8
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
9
+
10
+ RSpec.configure do |config|
11
+
12
+ end
@@ -0,0 +1,206 @@
1
+ # encoding: utf-8
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
4
+
5
+ describe UnicodeScanner do
6
+ it "should pass all the class-level examples" do
7
+ s = UnicodeScanner.new('This is an example string')
8
+ s.eos?.should == false
9
+
10
+ s.scan(/\w+/).should == "This"
11
+ s.scan(/\w+/).should == nil
12
+ s.scan(/\s+/).should == " "
13
+ s.scan(/\s+/).should == nil
14
+ s.scan(/\w+/).should == "is"
15
+ s.eos?.should == false
16
+
17
+ s.scan(/\s+/).should == " "
18
+ s.scan(/\w+/).should == "an"
19
+ s.scan(/\s+/).should == " "
20
+ s.scan(/\w+/).should == "example"
21
+ s.scan(/\s+/).should == " "
22
+ s.scan(/\w+/).should == "string"
23
+ s.eos?.should == true
24
+
25
+ s.scan(/\s+/).should == nil
26
+ s.scan(/\w+/).should == nil
27
+ end
28
+
29
+ it "should pass the #concat example" do
30
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
31
+ s.scan(/Fri /)
32
+ s << " +1000 GMT"
33
+ s.string.should == "Fri Dec 12 1975 14:39 +1000 GMT"
34
+ s.scan(/Dec/).should == "Dec"
35
+ end
36
+
37
+ it "should pass the #[] example" do
38
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
39
+ s.scan(/(\w+) (\w+) (\d+) /).should == "Fri Dec 12 "
40
+ s[0].should == "Fri Dec 12 "
41
+ s[1].should == "Fri"
42
+ s[2].should == "Dec"
43
+ s[3].should == "12"
44
+ s.post_match.should == "1975 14:39"
45
+ s.pre_match.should == ""
46
+ end
47
+
48
+ it "should pass the #beginning_of_line? example" do
49
+ s = UnicodeScanner.new("test\ntest\n")
50
+ s.bol?.should == true
51
+ s.scan(/te/)
52
+ s.bol?.should == false
53
+ s.scan(/st\n/)
54
+ s.bol?.should == true
55
+ s.terminate
56
+ s.bol?.should == true
57
+ end
58
+
59
+ it "should pass the #check example" do
60
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
61
+ s.check(/Fri/).should == "Fri"
62
+ s.pos.should == 0
63
+ s.matched.should == "Fri"
64
+ s.check(/12/).should == nil
65
+ s.matched.should == nil
66
+ end
67
+
68
+ it "should pass the #check_until example" do
69
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
70
+ s.check_until(/12/).should == "Fri Dec 12"
71
+ s.pos.should == 0
72
+ s.matched.should == "12"
73
+ end
74
+
75
+ it "should pass the #eos? example" do
76
+ s = UnicodeScanner.new('test string')
77
+ s.eos?.should == false
78
+ s.scan(/test/)
79
+ s.eos?.should == false
80
+ s.terminate
81
+ s.eos?.should == true
82
+ end
83
+
84
+ it "should pass the #exist? example" do
85
+ s = UnicodeScanner.new('test string')
86
+ s.exist?(/s/).should == 3
87
+ s.scan(/test/).should == "test"
88
+ s.exist?(/s/).should == 2
89
+ s.exist?(/e/).should == nil
90
+ end
91
+
92
+ it "should pass a tweaked version of the #getch example" do
93
+ s = UnicodeScanner.new("ab")
94
+ s.getch.should == "a"
95
+ s.getch.should == "b"
96
+ s.getch.should == nil
97
+
98
+ s = UnicodeScanner.new("ぁ")
99
+ s.getch.should == "ぁ" # Japanese hira-kana "A" in EUC-JP
100
+ s.getch.should == nil
101
+ end
102
+
103
+ it "should pass the #inspect example" do
104
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
105
+ s.inspect.should == '#<UnicodeScanner 0/21 @ "Fri D...">'
106
+ s.scan_until(/12/).should == "Fri Dec 12"
107
+ s.inspect.should == '#<UnicodeScanner 10/21 "...ec 12" @ " 1975...">'
108
+ end
109
+
110
+ it "should pass the #match? example" do
111
+ s = UnicodeScanner.new('test string')
112
+ s.match?(/\w+/).should == 4
113
+ s.match?(/\w+/).should == 4
114
+ s.match?(/\s+/).should == nil
115
+ end
116
+
117
+ it "should pass the #matched example" do
118
+ s = UnicodeScanner.new('test string')
119
+ s.match?(/\w+/).should == 4
120
+ s.matched.should == "test"
121
+ end
122
+
123
+ it "should pass the #matched? example" do
124
+ s = UnicodeScanner.new('test string')
125
+ s.match?(/\w+/).should == 4
126
+ s.matched?.should == true
127
+ s.match?(/\d+/).should == nil
128
+ s.matched?.should == false
129
+ end
130
+
131
+ it "should pass the #matched_size example" do
132
+ s = UnicodeScanner.new('test string')
133
+ s.check(/\w+/).should == "test"
134
+ s.matched_size.should == 4
135
+ s.check(/\d+/).should == nil
136
+ s.matched_size.should == nil
137
+ end
138
+
139
+ it "should pass the #peek example" do
140
+ s = UnicodeScanner.new('test string')
141
+ s.peek(7).should == "test st"
142
+ s.peek(7).should == "test st"
143
+ end
144
+
145
+ it "should pass the #pos example" do
146
+ s = UnicodeScanner.new('test string')
147
+ s.pos.should == 0
148
+ s.scan_until(/str/).should == "test str"
149
+ s.pos.should == 8
150
+ s.terminate.inspect.should == "#<UnicodeScanner fin>"
151
+ s.pos.should == 11
152
+ end
153
+
154
+ it "should pass the #pos= example" do
155
+ s = UnicodeScanner.new('test string')
156
+ (s.pos = 7).should == 7
157
+ s.rest.should == "ring"
158
+ end
159
+
160
+ it "should pass the #post_match/#pre_match example" do
161
+ s = UnicodeScanner.new('test string')
162
+ s.scan(/\w+/).should == "test"
163
+ s.scan(/\s+/).should == " "
164
+ s.pre_match.should == "test"
165
+ s.post_match.should == "string"
166
+ end
167
+
168
+ it "should pass the #scan example" do
169
+ s = UnicodeScanner.new('test string')
170
+ s.scan(/\w+/).should == "test"
171
+ s.scan(/\w+/).should == nil
172
+ s.scan(/\s+/).should == " "
173
+ s.scan(/\w+/).should == "string"
174
+ s.scan(/./).should == nil
175
+ end
176
+
177
+ it "should pass the #scan_until example" do
178
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
179
+ s.scan_until(/1/).should == "Fri Dec 1"
180
+ s.pre_match.should == "Fri Dec "
181
+ s.scan_until(/XYZ/).should == nil
182
+ end
183
+
184
+ it "should pass the #skip example" do
185
+ s = UnicodeScanner.new('test string')
186
+ s.skip(/\w+/).should == 4
187
+ s.skip(/\w+/).should == nil
188
+ s.skip(/\s+/).should == 1
189
+ s.skip(/\w+/).should == 6
190
+ s.skip(/./).should == nil
191
+ end
192
+
193
+ it "should pass the half-finished #skip_until example" do
194
+ s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
195
+ s.skip_until(/12/).should == 10
196
+ end
197
+
198
+ it "should pass the #unscan example" do
199
+ s = UnicodeScanner.new('test string')
200
+ s.scan(/\w+/).should == "test"
201
+ s.unscan
202
+ s.scan(/../).should == "te"
203
+ s.scan(/\d/).should == nil
204
+ -> { s.unscan }.should raise_error(ScanError, 'unscan failed: previous match record not exist')
205
+ end
206
+ end
@@ -0,0 +1,64 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s| # Package definition for unicode_scanner; generated by jeweler, so regenerate via 'rake gemspec' instead of hand-editing
7
+ s.name = "unicode_scanner"
8
+ s.version = "1.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= # set only when this RubyGems exposes the setter
11
+ s.authors = ["Tim Morgan"]
12
+ s.date = "2012-07-12"
13
+ s.description = "An implementation of StringScanner that doesn't split multibyte characters."
14
+ s.email = "git@timothymorgan.info"
15
+ s.extra_rdoc_files = [ # non-Ruby files to include in the generated documentation
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [ # complete list of files packaged into the gem
20
+ ".document",
21
+ ".rspec",
22
+ ".rvmrc",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.md",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "lib/unicode_scanner.rb",
30
+ "spec/spec_helper.rb",
31
+ "spec/unicode_scanner_spec.rb",
32
+ "unicode_scanner.gemspec"
33
+ ]
34
+ s.homepage = "http://github.com/RISCfuture/unicode_scanner"
35
+ s.licenses = ["MIT"]
36
+ s.require_paths = ["lib"]
37
+ s.rubygems_version = "1.8.24"
38
+ s.summary = "Unicode-aware implementation of StringScanner"
39
+
40
+ if s.respond_to? :specification_version then # RubyGems that understands specification_version
41
+ s.specification_version = 3
42
+
43
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then # RubyGems >= 1.2.0: tooling is declared development-only
44
+ s.add_development_dependency(%q<rspec>, [">= 0"])
45
+ s.add_development_dependency(%q<redcarpet>, [">= 0"])
46
+ s.add_development_dependency(%q<yard>, [">= 0"])
47
+ s.add_development_dependency(%q<bundler>, [">= 0"])
48
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
49
+ else # older RubyGems: the same five gems are declared with plain add_dependency
50
+ s.add_dependency(%q<rspec>, [">= 0"])
51
+ s.add_dependency(%q<redcarpet>, [">= 0"])
52
+ s.add_dependency(%q<yard>, [">= 0"])
53
+ s.add_dependency(%q<bundler>, [">= 0"])
54
+ s.add_dependency(%q<jeweler>, [">= 0"])
55
+ end
56
+ else # RubyGems without specification_version: same plain add_dependency fallback
57
+ s.add_dependency(%q<rspec>, [">= 0"])
58
+ s.add_dependency(%q<redcarpet>, [">= 0"])
59
+ s.add_dependency(%q<yard>, [">= 0"])
60
+ s.add_dependency(%q<bundler>, [">= 0"])
61
+ s.add_dependency(%q<jeweler>, [">= 0"])
62
+ end
63
+ end
64
+
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode_scanner
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tim Morgan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-12 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: redcarpet
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: yard
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: jeweler
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: An implementation of StringScanner that doesn't split multibyte characters.
95
+ email: git@timothymorgan.info
96
+ executables: []
97
+ extensions: []
98
+ extra_rdoc_files:
99
+ - LICENSE.txt
100
+ - README.md
101
+ files:
102
+ - .document
103
+ - .rspec
104
+ - .rvmrc
105
+ - Gemfile
106
+ - Gemfile.lock
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - VERSION
111
+ - lib/unicode_scanner.rb
112
+ - spec/spec_helper.rb
113
+ - spec/unicode_scanner_spec.rb
114
+ - unicode_scanner.gemspec
115
+ homepage: http://github.com/RISCfuture/unicode_scanner
116
+ licenses:
117
+ - MIT
118
+ post_install_message:
119
+ rdoc_options: []
120
+ require_paths:
121
+ - lib
122
+ required_ruby_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ segments:
129
+ - 0
130
+ hash: -3935821298050612576
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project:
139
+ rubygems_version: 1.8.24
140
+ signing_key:
141
+ specification_version: 3
142
+ summary: Unicode-aware implementation of StringScanner
143
+ test_files: []