rbs 3.3.2 → 3.4.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (128) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/comments.yml +2 -5
  3. data/.github/workflows/ruby.yml +7 -8
  4. data/.github/workflows/typecheck.yml +37 -0
  5. data/CHANGELOG.md +50 -0
  6. data/Gemfile +1 -1
  7. data/Gemfile.lock +11 -11
  8. data/Steepfile +2 -2
  9. data/core/array.rbs +19 -49
  10. data/core/basic_object.rbs +2 -2
  11. data/core/comparable.rbs +17 -8
  12. data/core/complex.rbs +82 -43
  13. data/core/data.rbs +2 -4
  14. data/core/dir.rbs +635 -295
  15. data/core/enumerable.rbs +11 -18
  16. data/core/enumerator.rbs +37 -31
  17. data/core/errors.rbs +4 -0
  18. data/core/false_class.rbs +34 -15
  19. data/core/fiber.rbs +23 -0
  20. data/core/file.rbs +329 -120
  21. data/core/float.rbs +17 -32
  22. data/core/gc.rbs +17 -11
  23. data/core/hash.rbs +22 -44
  24. data/core/integer.rbs +82 -113
  25. data/core/io/buffer.rbs +90 -47
  26. data/core/io.rbs +39 -116
  27. data/core/kernel.rbs +442 -489
  28. data/core/match_data.rbs +55 -56
  29. data/core/module.rbs +45 -1
  30. data/core/nil_class.rbs +98 -35
  31. data/core/numeric.rbs +22 -32
  32. data/core/object_space/weak_key_map.rbs +102 -0
  33. data/core/process.rbs +1242 -655
  34. data/core/ractor.rbs +139 -120
  35. data/core/range.rbs +100 -4
  36. data/core/rational.rbs +0 -4
  37. data/core/rbs/unnamed/argf.rbs +16 -8
  38. data/core/rbs/unnamed/env_class.rbs +0 -24
  39. data/core/refinement.rbs +8 -0
  40. data/core/regexp.rbs +1149 -598
  41. data/core/ruby_vm.rbs +126 -12
  42. data/core/rubygems/platform.rbs +9 -0
  43. data/core/rubygems/rubygems.rbs +1 -1
  44. data/core/rubygems/version.rbs +5 -1
  45. data/core/set.rbs +20 -22
  46. data/core/signal.rbs +4 -4
  47. data/core/string.rbs +283 -230
  48. data/core/string_io.rbs +2 -14
  49. data/core/struct.rbs +404 -24
  50. data/core/symbol.rbs +1 -19
  51. data/core/thread.rbs +29 -12
  52. data/core/time.rbs +227 -104
  53. data/core/trace_point.rbs +2 -5
  54. data/core/true_class.rbs +54 -21
  55. data/core/warning.rbs +14 -11
  56. data/docs/data_and_struct.md +29 -0
  57. data/docs/syntax.md +3 -5
  58. data/docs/tools.md +1 -0
  59. data/ext/rbs_extension/lexer.c +643 -559
  60. data/ext/rbs_extension/lexer.re +5 -1
  61. data/ext/rbs_extension/parser.c +12 -3
  62. data/ext/rbs_extension/unescape.c +7 -47
  63. data/lib/rbs/cli/diff.rb +4 -1
  64. data/lib/rbs/cli/validate.rb +280 -0
  65. data/lib/rbs/cli.rb +2 -194
  66. data/lib/rbs/collection/config.rb +5 -6
  67. data/lib/rbs/collection/sources/git.rb +1 -1
  68. data/lib/rbs/collection.rb +1 -0
  69. data/lib/rbs/diff.rb +7 -4
  70. data/lib/rbs/errors.rb +11 -0
  71. data/lib/rbs/test/errors.rb +4 -1
  72. data/lib/rbs/test/guaranteed.rb +2 -3
  73. data/lib/rbs/test/type_check.rb +15 -10
  74. data/lib/rbs/test.rb +3 -3
  75. data/lib/rbs/types.rb +29 -0
  76. data/lib/rbs/unit_test/convertibles.rb +176 -0
  77. data/lib/rbs/unit_test/spy.rb +136 -0
  78. data/lib/rbs/unit_test/type_assertions.rb +341 -0
  79. data/lib/rbs/unit_test/with_aliases.rb +143 -0
  80. data/lib/rbs/unit_test.rb +6 -0
  81. data/lib/rbs/version.rb +1 -1
  82. data/sig/cli/validate.rbs +43 -0
  83. data/sig/diff.rbs +3 -1
  84. data/sig/errors.rbs +8 -0
  85. data/sig/rbs.rbs +1 -1
  86. data/sig/test/errors.rbs +52 -0
  87. data/sig/test/guranteed.rbs +9 -0
  88. data/sig/test/type_check.rbs +19 -0
  89. data/sig/test.rbs +82 -0
  90. data/sig/types.rbs +6 -1
  91. data/sig/unit_test/convertibles.rbs +154 -0
  92. data/sig/unit_test/spy.rbs +28 -0
  93. data/sig/unit_test/type_assertions.rbs +194 -0
  94. data/sig/unit_test/with_aliases.rbs +136 -0
  95. data/stdlib/base64/0/base64.rbs +307 -45
  96. data/stdlib/bigdecimal/0/big_decimal.rbs +35 -15
  97. data/stdlib/coverage/0/coverage.rbs +2 -2
  98. data/stdlib/csv/0/csv.rbs +25 -55
  99. data/stdlib/date/0/date.rbs +1 -43
  100. data/stdlib/date/0/date_time.rbs +1 -13
  101. data/stdlib/delegate/0/delegator.rbs +186 -0
  102. data/stdlib/delegate/0/kernel.rbs +47 -0
  103. data/stdlib/delegate/0/simple_delegator.rbs +98 -0
  104. data/stdlib/did_you_mean/0/did_you_mean.rbs +1 -1
  105. data/stdlib/erb/0/erb.rbs +2 -2
  106. data/stdlib/fileutils/0/fileutils.rbs +0 -19
  107. data/stdlib/io-console/0/io-console.rbs +12 -1
  108. data/stdlib/ipaddr/0/ipaddr.rbs +2 -1
  109. data/stdlib/json/0/json.rbs +320 -81
  110. data/stdlib/logger/0/logger.rbs +9 -5
  111. data/stdlib/monitor/0/monitor.rbs +78 -0
  112. data/stdlib/net-http/0/net-http.rbs +1880 -543
  113. data/stdlib/objspace/0/objspace.rbs +19 -13
  114. data/stdlib/openssl/0/openssl.rbs +508 -127
  115. data/stdlib/optparse/0/optparse.rbs +25 -11
  116. data/stdlib/pathname/0/pathname.rbs +1 -1
  117. data/stdlib/pp/0/pp.rbs +2 -5
  118. data/stdlib/prettyprint/0/prettyprint.rbs +2 -2
  119. data/stdlib/pstore/0/pstore.rbs +2 -4
  120. data/stdlib/rdoc/0/comment.rbs +1 -2
  121. data/stdlib/resolv/0/resolv.rbs +4 -2
  122. data/stdlib/socket/0/socket.rbs +2 -2
  123. data/stdlib/socket/0/unix_socket.rbs +2 -2
  124. data/stdlib/strscan/0/string_scanner.rbs +3 -2
  125. data/stdlib/tempfile/0/tempfile.rbs +1 -1
  126. data/stdlib/uri/0/common.rbs +245 -123
  127. metadata +23 -4
  128. data/lib/rbs/test/spy.rb +0 -6
data/core/regexp.rbs CHANGED
@@ -1,116 +1,271 @@
1
1
  # <!-- rdoc-file=re.c -->
2
- # Regular expressions (*regexp*s) are patterns which describe the contents of a
3
- # string. They're used for testing whether a string contains a given pattern, or
4
- # extracting the portions that match. They are created with the `/`*pat*`/` and
5
- # `%r{`*pat*`}` literals or the `Regexp.new` constructor.
2
+ # A [regular expression](https://en.wikipedia.org/wiki/Regular_expression) (also
3
+ # called a *regexp*) is a *match pattern* (also simply called a *pattern*).
6
4
  #
7
- # A regexp is usually delimited with forward slashes (`/`). For example:
5
+ # A common notation for a regexp uses enclosing slash characters:
8
6
  #
9
- # /hay/ =~ 'haystack' #=> 0
10
- # /y/.match('haystack') #=> #<MatchData "y">
7
+ # /foo/
11
8
  #
12
- # If a string contains the pattern it is said to *match*. A literal string
13
- # matches itself.
9
+ # A regexp may be applied to a *target string*; the part of the string (if any)
10
+ # that matches the pattern is called a *match*, and may be said *to match*:
14
11
  #
15
- # Here 'haystack' does not contain the pattern 'needle', so it doesn't match:
12
+ # re = /red/
13
+ # re.match?('redirect') # => true # Match at beginning of target.
14
+ # re.match?('bored') # => true # Match at end of target.
15
+ # re.match?('credit') # => true # Match within target.
16
+ # re.match?('foo') # => false # No match.
16
17
  #
17
- # /needle/.match('haystack') #=> nil
18
+ # ## Regexp Uses
18
19
  #
19
- # Here 'haystack' contains the pattern 'hay', so it matches:
20
+ # A regexp may be used:
20
21
  #
21
- # /hay/.match('haystack') #=> #<MatchData "hay">
22
+ # * To extract substrings based on a given pattern:
22
23
  #
23
- # Specifically, `/st/` requires that the string contains the letter *s* followed
24
- # by the letter *t*, so it matches *haystack*, also.
24
+ # re = /foo/ # => /foo/
25
+ # re.match('food') # => #<MatchData "foo">
26
+ # re.match('good') # => nil
25
27
  #
26
- # Note that any Regexp matching will raise a RuntimeError if timeout is set and
27
- # exceeded. See ["Timeout"](#label-Timeout) section in detail.
28
+ # See sections [Method match](rdoc-ref:Regexp@Method+match) and [Operator
29
+ # =~](rdoc-ref:Regexp@Operator+-3D~).
28
30
  #
29
- # ## Regexp Interpolation
31
+ # * To determine whether a string matches a given pattern:
30
32
  #
31
- # A regexp may contain interpolated strings; trivially:
33
+ # re.match?('food') # => true
34
+ # re.match?('good') # => false
32
35
  #
33
- # foo = 'bar'
34
- # /#{foo}/ # => /bar/
36
+ # See section [Method match?](rdoc-ref:Regexp@Method+match-3F).
35
37
  #
36
- # ## `=~` and Regexp#match
38
+ # * As an argument for calls to certain methods in other classes and modules;
39
+ # most such methods accept an argument that may be either a string or the
40
+ # (much more powerful) regexp.
37
41
  #
38
- # Pattern matching may be achieved by using `=~` operator or Regexp#match
39
- # method.
42
+ # See [Regexp Methods](rdoc-ref:regexp/methods.rdoc).
40
43
  #
41
- # ### `=~` Operator
42
44
  #
43
- # `=~` is Ruby's basic pattern-matching operator. When one operand is a regular
44
- # expression and the other is a string then the regular expression is used as a
45
- # pattern to match against the string. (This operator is equivalently defined
46
- # by Regexp and String so the order of String and Regexp do not matter. Other
47
- # classes may have different implementations of `=~`.) If a match is found, the
48
- # operator returns index of first match in string, otherwise it returns `nil`.
45
+ # ## Regexp Objects
49
46
  #
50
- # /hay/ =~ 'haystack' #=> 0
51
- # 'haystack' =~ /hay/ #=> 0
52
- # /a/ =~ 'haystack' #=> 1
53
- # /u/ =~ 'haystack' #=> nil
47
+ # A regexp object has:
54
48
  #
55
- # Using `=~` operator with a String and Regexp the `$~` global variable is set
56
- # after a successful match. `$~` holds a MatchData object. Regexp.last_match is
57
- # equivalent to `$~`.
49
+ # * A source; see [Sources](rdoc-ref:Regexp@Sources).
58
50
  #
59
- # ### Regexp#match Method
51
+ # * Several modes; see [Modes](rdoc-ref:Regexp@Modes).
60
52
  #
61
- # The #match method returns a MatchData object:
53
+ # * A timeout; see [Timeouts](rdoc-ref:Regexp@Timeouts).
62
54
  #
63
- # /st/.match('haystack') #=> #<MatchData "st">
55
+ # * An encoding; see [Encodings](rdoc-ref:Regexp@Encodings).
64
56
  #
65
- # ## Metacharacters and Escapes
66
57
  #
67
- # The following are *metacharacters* `(`, `)`, `[`, `]`, `{`, `}`, `.`, `?`,
68
- # `+`, `*`. They have a specific meaning when appearing in a pattern. To match
69
- # them literally they must be backslash-escaped. To match a backslash literally,
70
- # backslash-escape it: `\\\`.
58
+ # ## Creating a Regexp
71
59
  #
72
- # /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> #<MatchData "1 + 2 = 3?">
73
- # /a\\\\b/.match('a\\\\b') #=> #<MatchData "a\\b">
60
+ # A regular expression may be created with:
74
61
  #
75
- # Patterns behave like double-quoted strings and can contain the same backslash
76
- # escapes (the meaning of `\s` is different, however, see
77
- # [below](#label-Character+Classes)).
62
+ # * A regexp literal using slash characters (see [Regexp
63
+ # Literals](rdoc-ref:syntax/literals.rdoc@Regexp+Literals)):
78
64
  #
79
- # /\s\u{6771 4eac 90fd}/.match("Go to 東京都")
80
- # #=> #<MatchData " 東京都">
65
+ # # This is a very common usage.
66
+ # /foo/ # => /foo/
81
67
  #
82
- # Arbitrary Ruby expressions can be embedded into patterns with the `#{...}`
83
- # construct.
68
+ # * A `%r` regexp literal (see [%r: Regexp
69
+ # Literals](rdoc-ref:syntax/literals.rdoc@25r-3A+Regexp+Literals)):
84
70
  #
85
- # place = "東京都"
86
- # /#{place}/.match("Go to 東京都")
87
- # #=> #<MatchData "東京都">
71
+ # # Same delimiter character at beginning and end;
72
+ # # useful for avoiding escaping characters
73
+ # %r/name\/value pair/ # => /name\/value pair/
74
+ # %r:name/value pair: # => /name\/value pair/
75
+ # %r|name/value pair| # => /name\/value pair/
88
76
  #
89
- # ## Character Classes
77
+ # # Certain "paired" characters can be delimiters.
78
+ # %r[foo] # => /foo/
79
+ # %r{foo} # => /foo/
80
+ # %r(foo) # => /foo/
81
+ # %r<foo> # => /foo/
90
82
  #
91
- # A *character class* is delimited with square brackets (`[`, `]`) and lists
92
- # characters that may appear at that point in the match. `/[ab]/` means *a* or
93
- # *b*, as opposed to `/ab/` which means *a* followed by *b*.
83
+ # * Method Regexp.new.
94
84
  #
95
- # /W[aeiou]rd/.match("Word") #=> #<MatchData "Word">
96
85
  #
97
- # Within a character class the hyphen (`-`) is a metacharacter denoting an
98
- # inclusive range of characters. `[abcd]` is equivalent to `[a-d]`. A range can
99
- # be followed by another range, so `[abcdwxyz]` is equivalent to `[a-dw-z]`. The
100
- # order in which ranges or individual characters appear inside a character class
101
- # is irrelevant.
86
+ # ## Method `match`
102
87
  #
103
- # /[0-9a-f]/.match('9f') #=> #<MatchData "9">
104
- # /[9f]/.match('9f') #=> #<MatchData "9">
88
+ # Each of the methods Regexp#match, String#match, and Symbol#match returns a
89
+ # MatchData object if a match was found, `nil` otherwise; each also sets [global
90
+ # variables](rdoc-ref:Regexp@Global+Variables):
105
91
  #
106
- # If the first character of a character class is a caret (`^`) the class is
107
- # inverted: it matches any character *except* those named.
92
+ # 'food'.match(/foo/) # => #<MatchData "foo">
93
+ # 'food'.match(/bar/) # => nil
108
94
  #
109
- # /[^a-eg-z]/.match('f') #=> #<MatchData "f">
95
+ # ## Operator `=~`
96
+ #
97
+ # Each of the operators Regexp#=~, String#=~, and Symbol#=~ returns an integer
98
+ # offset if a match was found, `nil` otherwise; each also sets [global
99
+ # variables](rdoc-ref:Regexp@Global+Variables):
100
+ #
101
+ # /bar/ =~ 'foo bar' # => 4
102
+ # 'foo bar' =~ /bar/ # => 4
103
+ # /baz/ =~ 'foo bar' # => nil
104
+ #
105
+ # ## Method `match?`
106
+ #
107
+ # Each of the methods Regexp#match?, String#match?, and Symbol#match? returns
108
+ # `true` if a match was found, `false` otherwise; none sets [global
109
+ # variables](rdoc-ref:Regexp@Global+Variables):
110
+ #
111
+ # 'food'.match?(/foo/) # => true
112
+ # 'food'.match?(/bar/) # => false
113
+ #
114
+ # ## Global Variables
115
+ #
116
+ # Certain regexp-oriented methods assign values to global variables:
117
+ #
118
+ # * `#match`: see [Method match](rdoc-ref:Regexp@Method+match).
119
+ # * `#=~`: see [Operator =~](rdoc-ref:Regexp@Operator+-3D~).
120
+ #
121
+ #
122
+ # The affected global variables are:
123
+ #
124
+ # * `$~`: Returns a MatchData object, or `nil`.
125
+ # * `$&`: Returns the matched part of the string, or `nil`.
126
+ # * `$``: Returns the part of the string to the left of the match, or `nil`.
127
+ # * `$'`: Returns the part of the string to the right of the match, or `nil`.
128
+ # * `$+`: Returns the last group matched, or `nil`.
129
+ # * `$1`, `$2`, etc.: Returns the first, second, etc., matched group, or
130
+ # `nil`. Note that `$0` is quite different; it returns the name of the
131
+ # currently executing program.
132
+ #
133
+ #
134
+ # Examples:
135
+ #
136
+ # # Matched string, but no matched groups.
137
+ # 'foo bar bar baz'.match('bar')
138
+ # $~ # => #<MatchData "bar">
139
+ # $& # => "bar"
140
+ # $` # => "foo "
141
+ # $' # => " bar baz"
142
+ # $+ # => nil
143
+ # $1 # => nil
144
+ #
145
+ # # Matched groups.
146
+ # /s(\w{2}).*(c)/.match('haystack')
147
+ # $~ # => #<MatchData "stac" 1:"ta" 2:"c">
148
+ # $& # => "stac"
149
+ # $` # => "hay"
150
+ # $' # => "k"
151
+ # $+ # => "c"
152
+ # $1 # => "ta"
153
+ # $2 # => "c"
154
+ # $3 # => nil
155
+ #
156
+ # # No match.
157
+ # 'foo'.match('bar')
158
+ # $~ # => nil
159
+ # $& # => nil
160
+ # $` # => nil
161
+ # $' # => nil
162
+ # $+ # => nil
163
+ # $1 # => nil
164
+ #
165
+ # Note that Regexp#match?, String#match?, and Symbol#match? do not set global
166
+ # variables.
167
+ #
168
+ # ## Sources
169
+ #
170
+ # As seen above, the simplest regexp uses a literal expression as its source:
171
+ #
172
+ # re = /foo/ # => /foo/
173
+ # re.match('food') # => #<MatchData "foo">
174
+ # re.match('good') # => nil
175
+ #
176
+ # A rich collection of available *subexpressions* gives the regexp great power
177
+ # and flexibility:
178
+ #
179
+ # * [Special characters](rdoc-ref:Regexp@Special+Characters)
180
+ # * [Source literals](rdoc-ref:Regexp@Source+Literals)
181
+ # * [Character classes](rdoc-ref:Regexp@Character+Classes)
182
+ # * [Shorthand character classes](rdoc-ref:Regexp@Shorthand+Character+Classes)
183
+ # * [Anchors](rdoc-ref:Regexp@Anchors)
184
+ # * [Alternation](rdoc-ref:Regexp@Alternation)
185
+ # * [Quantifiers](rdoc-ref:Regexp@Quantifiers)
186
+ # * [Groups and captures](rdoc-ref:Regexp@Groups+and+Captures)
187
+ # * [Unicode](rdoc-ref:Regexp@Unicode)
188
+ # * [POSIX Bracket Expressions](rdoc-ref:Regexp@POSIX+Bracket+Expressions)
189
+ # * [Comments](rdoc-ref:Regexp@Comments)
190
+ #
191
+ #
192
+ # ### Special Characters
193
+ #
194
+ # Regexp special characters, called *metacharacters*, have special meanings in
195
+ # certain contexts; depending on the context, these are sometimes
196
+ # metacharacters:
197
+ #
198
+ # . ? - + * ^ \ | $ ( ) [ ] { }
199
+ #
200
+ # To match a metacharacter literally, backslash-escape it:
201
+ #
202
+ # # Matches one or more 'o' characters.
203
+ # /o+/.match('foo') # => #<MatchData "oo">
204
+ # # Would match 'o+'.
205
+ # /o\+/.match('foo') # => nil
206
+ #
207
+ # To match a backslash literally, backslash-escape it:
208
+ #
209
+ # /\./.match('\.') # => #<MatchData ".">
210
+ # /\\./.match('\.') # => #<MatchData "\\.">
211
+ #
212
+ # Method Regexp.escape returns an escaped string:
213
+ #
214
+ # Regexp.escape('.?-+*^\|$()[]{}')
215
+ # # => "\\.\\?\\-\\+\\*\\^\\\\\\|\\$\\(\\)\\[\\]\\{\\}"
216
+ #
217
+ # ### Source Literals
218
+ #
219
+ # The source literal largely behaves like a double-quoted string; see [String
220
+ # Literals](rdoc-ref:syntax/literals.rdoc@String+Literals).
221
+ #
222
+ # In particular, a source literal may contain interpolated expressions:
223
+ #
224
+ # s = 'foo' # => "foo"
225
+ # /#{s}/ # => /foo/
226
+ # /#{s.capitalize}/ # => /Foo/
227
+ # /#{2 + 2}/ # => /4/
228
+ #
229
+ # There are differences between an ordinary string literal and a source literal;
230
+ # see [Shorthand Character
231
+ # Classes](rdoc-ref:Regexp@Shorthand+Character+Classes).
232
+ #
233
+ # * `\s` in an ordinary string literal is equivalent to a space character; in
234
+ # a source literal, it's shorthand for matching a whitespace character.
235
+ # * In an ordinary string literal, these are (needlessly) escaped characters;
236
+ # in a source literal, they are shorthands for various matching characters:
237
+ #
238
+ # \w \W \d \D \h \H \S \R
239
+ #
240
+ #
241
+ # ### Character Classes
242
+ #
243
+ # A *character class* is delimited by square brackets; it specifies that certain
244
+ # characters match at a given point in the target string:
245
+ #
246
+ # # This character class will match any vowel.
247
+ # re = /B[aeiou]rd/
248
+ # re.match('Bird') # => #<MatchData "Bird">
249
+ # re.match('Bard') # => #<MatchData "Bard">
250
+ # re.match('Byrd') # => nil
251
+ #
252
+ # A character class may contain hyphen characters to specify ranges of
253
+ # characters:
254
+ #
255
+ # # These regexps have the same effect.
256
+ # /[abcdef]/.match('foo') # => #<MatchData "f">
257
+ # /[a-f]/.match('foo') # => #<MatchData "f">
258
+ # /[a-cd-f]/.match('foo') # => #<MatchData "f">
259
+ #
260
+ # When the first character of a character class is a caret (`^`), the sense of
261
+ # the class is inverted: it matches any character *except* those specified.
262
+ #
263
+ # /[^a-eg-z]/.match('f') # => #<MatchData "f">
110
264
  #
111
265
  # A character class may contain another character class. By itself this isn't
112
- # useful because `[a-z[0-9]]` describes the same set as `[a-z0-9]`. However,
113
- # character classes also support the `&&` operator which performs set
266
+ # useful because `[a-z[0-9]]` describes the same set as `[a-z0-9]`.
267
+ #
268
+ # However, character classes also support the `&&` operator, which performs set
114
269
  # intersection on its arguments. The two can be combined as follows:
115
270
  #
116
271
  # /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z))
@@ -119,238 +274,481 @@
119
274
  #
120
275
  # /[abh-w]/
121
276
  #
122
- # The following metacharacters also behave like character classes:
123
- #
124
- # * `/./` - Any character except a newline.
125
- # * `/./m` - Any character (the `m` modifier enables multiline mode)
126
- # * `/\w/` - A word character (`[a-zA-Z0-9_]`)
127
- # * `/\W/` - A non-word character (`[^a-zA-Z0-9_]`). Please take a look at
128
- # [Bug #4044](https://bugs.ruby-lang.org/issues/4044) if using `/\W/` with
129
- # the `/i` modifier.
130
- # * `/\d/` - A digit character (`[0-9]`)
131
- # * `/\D/` - A non-digit character (`[^0-9]`)
132
- # * `/\h/` - A hexdigit character (`[0-9a-fA-F]`)
133
- # * `/\H/` - A non-hexdigit character (`[^0-9a-fA-F]`)
134
- # * `/\s/` - A whitespace character: `/[ \t\r\n\f\v]/`
135
- # * `/\S/` - A non-whitespace character: `/[^ \t\r\n\f\v]/`
136
- # * `/\R/` - A linebreak: `\n`, `\v`, `\f`, `\r` `\u0085` (NEXT LINE),
137
- # `\u2028` (LINE SEPARATOR), `\u2029` (PARAGRAPH SEPARATOR) or `\r\n`.
138
- #
139
- #
140
- # POSIX *bracket expressions* are also similar to character classes. They
141
- # provide a portable alternative to the above, with the added benefit that they
142
- # encompass non-ASCII characters. For instance, `/\d/` matches only the ASCII
143
- # decimal digits (0-9); whereas `/[[:digit:]]/` matches any character in the
144
- # Unicode *Nd* category.
145
- #
146
- # * `/[[:alnum:]]/` - Alphabetic and numeric character
147
- # * `/[[:alpha:]]/` - Alphabetic character
148
- # * `/[[:blank:]]/` - Space or tab
149
- # * `/[[:cntrl:]]/` - Control character
150
- # * `/[[:digit:]]/` - Digit
151
- # * `/[[:graph:]]/` - Non-blank character (excludes spaces, control
152
- # characters, and similar)
153
- # * `/[[:lower:]]/` - Lowercase alphabetical character
154
- # * `/[[:print:]]/` - Like [:graph:], but includes the space character
155
- # * `/[[:punct:]]/` - Punctuation character
156
- # * `/[[:space:]]/` - Whitespace character (`[:blank:]`, newline, carriage
157
- # return, etc.)
158
- # * `/[[:upper:]]/` - Uppercase alphabetical
159
- # * `/[[:xdigit:]]/` - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
277
+ # ### Shorthand Character Classes
160
278
  #
279
+ # Each of the following metacharacters serves as a shorthand for a character
280
+ # class:
161
281
  #
162
- # Ruby also supports the following non-POSIX character classes:
282
+ # * `/./`: Matches any character except a newline:
163
283
  #
164
- # * `/[[:word:]]/` - A character in one of the following Unicode general
165
- # categories *Letter*, *Mark*, *Number*, *Connector_Punctuation*
166
- # * `/[[:ascii:]]/` - A character in the ASCII character set
284
+ # /./.match('foo') # => #<MatchData "f">
285
+ # /./.match("\n") # => nil
167
286
  #
168
- # # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO"
169
- # /[[:digit:]]/.match("\u06F2") #=> #<MatchData "\u{06F2}">
170
- # /[[:upper:]][[:lower:]]/.match("Hello") #=> #<MatchData "He">
171
- # /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> #<MatchData "A6">
287
+ # * `/./m`: Matches any character, including a newline; see [Multiline
288
+ # Mode](rdoc-ref:Regexp@Multiline+Mode):
172
289
  #
290
+ # /./m.match("\n") # => #<MatchData "\n">
173
291
  #
174
- # ## Repetition
292
+ # * `/\w/`: Matches a word character: equivalent to `[a-zA-Z0-9_]`:
175
293
  #
176
- # The constructs described so far match a single character. They can be followed
177
- # by a repetition metacharacter to specify how many times they need to occur.
178
- # Such metacharacters are called *quantifiers*.
294
+ # /\w/.match(' foo') # => #<MatchData "f">
295
+ # /\w/.match(' _') # => #<MatchData "_">
296
+ # /\w/.match(' ') # => nil
179
297
  #
180
- # * `*` - Zero or more times
181
- # * `+` - One or more times
182
- # * `?` - Zero or one times (optional)
183
- # * `{`*n*`}` - Exactly *n* times
184
- # * `{`*n*`,}` - *n* or more times
185
- # * `{,`*m*`}` - *m* or less times
186
- # * `{`*n*`,`*m*`}` - At least *n* and at most *m* times
298
+ # * `/\W/`: Matches a non-word character: equivalent to `[^a-zA-Z0-9_]`:
187
299
  #
300
+ # /\W/.match(' ') # => #<MatchData " ">
301
+ # /\W/.match('_') # => nil
188
302
  #
189
- # At least one uppercase character ('H'), at least one lowercase character
190
- # ('e'), two 'l' characters, then one 'o':
303
+ # * `/\d/`: Matches a digit character: equivalent to `[0-9]`:
191
304
  #
192
- # "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> #<MatchData "Hello">
305
+ # /\d/.match('THX1138') # => #<MatchData "1">
306
+ # /\d/.match('foo') # => nil
193
307
  #
194
- # ### Greedy Match
308
+ # * `/\D/`: Matches a non-digit character: equivalent to `[^0-9]`:
195
309
  #
196
- # Repetition is *greedy* by default: as many occurrences as possible are matched
197
- # while still allowing the overall match to succeed. By contrast, *lazy*
198
- # matching makes the minimal amount of matches necessary for overall success.
199
- # Most greedy metacharacters can be made lazy by following them with `?`. For
200
- # the `{n}` pattern, because it specifies an exact number of characters to match
201
- # and not a variable number of characters, the `?` metacharacter instead makes
202
- # the repeated pattern optional.
310
+ # /\D/.match('123Jump!') # => #<MatchData "J">
311
+ # /\D/.match('123') # => nil
203
312
  #
204
- # Both patterns below match the string. The first uses a greedy quantifier so
205
- # '.+' matches '<a><b>'; the second uses a lazy quantifier so '.+?' matches
206
- # '<a>':
313
+ # * `/\h/`: Matches a hexdigit character: equivalent to `[0-9a-fA-F]`:
207
314
  #
208
- # /<.+>/.match("<a><b>") #=> #<MatchData "<a><b>">
209
- # /<.+?>/.match("<a><b>") #=> #<MatchData "<a>">
315
+ # /\h/.match('xyz fedcba9876543210') # => #<MatchData "f">
316
+ # /\h/.match('xyz') # => nil
210
317
  #
211
- # ### Possessive Match
318
+ # * `/\H/`: Matches a non-hexdigit character: equivalent to `[^0-9a-fA-F]`:
212
319
  #
213
- # A quantifier followed by `+` matches *possessively*: once it has matched it
214
- # does not backtrack. They behave like greedy quantifiers, but having matched
215
- # they refuse to "give up" their match even if this jeopardises the overall
216
- # match.
320
+ # /\H/.match('fedcba9876543210xyz') # => #<MatchData "x">
321
+ # /\H/.match('fedcba9876543210') # => nil
217
322
  #
218
- # /<.*><.+>/.match("<a><b>") #=> #<MatchData "<a><b>">
219
- # /<.*+><.+>/.match("<a><b>") #=> nil
220
- # /<.*><.++>/.match("<a><b>") #=> nil
323
+ # * `/\s/`: Matches a whitespace character: equivalent to `/[ \t\r\n\f\v]/`:
221
324
  #
222
- # ## Capturing
325
+ # /\s/.match('foo bar') # => #<MatchData " ">
326
+ # /\s/.match('foo') # => nil
223
327
  #
224
- # Parentheses can be used for *capturing*. The text enclosed by the *n*th group
225
- # of parentheses can be subsequently referred to with *n*. Within a pattern use
226
- # the *backreference* `\n` (e.g. `\1`); outside of the pattern use
227
- # `MatchData[n]` (e.g. `MatchData[1]`).
328
+ # * `/\S/`: Matches a non-whitespace character: equivalent to `/[^
329
+ # \t\r\n\f\v]/`:
228
330
  #
229
- # In this example, `'at'` is captured by the first group of parentheses, then
230
- # referred to later with `\1`:
331
+ # /\S/.match(" \t\r\n\f\v foo") # => #<MatchData "f">
332
+ # /\S/.match(" \t\r\n\f\v") # => nil
231
333
  #
232
- # /[csh](..) [csh]\1 in/.match("The cat sat in the hat")
233
- # #=> #<MatchData "cat sat in" 1:"at">
334
+ # * `/\R/`: Matches a linebreak, platform-independently:
234
335
  #
235
- # Regexp#match returns a MatchData object which makes the captured text
236
- # available with its #[] method:
336
+ # /\R/.match("\r") # => #<MatchData "\r"> # Carriage return (CR)
337
+ # /\R/.match("\n") # => #<MatchData "\n"> # Newline (LF)
338
+ # /\R/.match("\f") # => #<MatchData "\f"> # Formfeed (FF)
339
+ # /\R/.match("\v") # => #<MatchData "\v"> # Vertical tab (VT)
340
+ # /\R/.match("\r\n") # => #<MatchData "\r\n"> # CRLF
341
+ # /\R/.match("\u0085") # => #<MatchData "\u0085"> # Next line (NEL)
342
+ # /\R/.match("\u2028") # => #<MatchData "\u2028"> # Line separator (LSEP)
343
+ # /\R/.match("\u2029") # => #<MatchData "\u2029"> # Paragraph separator (PSEP)
237
344
  #
238
- # /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at'
239
345
  #
240
- # While Ruby supports an arbitrary number of numbered captured groups, only
241
- # groups 1-9 are supported using the `\n` backreference syntax.
346
+ # ### Anchors
242
347
  #
243
- # Ruby also supports `\0` as a special backreference, which references the
244
- # entire matched string. This is also available at `MatchData[0]`. Note that
245
- # the `\0` backreference cannot be used inside the regexp, as backreferences can
246
- # only be used after the end of the capture group, and the `\0` backreference
247
- # uses the implicit capture group of the entire match. However, you can use
248
- # this backreference when doing substitution:
348
+ # An anchor is a metasequence that matches a zero-width position between
349
+ # characters in the target string.
249
350
  #
250
- # "The cat sat in the hat".gsub(/[csh]at/, '\0s')
251
- # # => "The cats sats in the hats"
351
+ # For a subexpression with no anchor, matching may begin anywhere in the target
352
+ # string:
252
353
  #
253
- # ### Named Captures
354
+ # /real/.match('surrealist') # => #<MatchData "real">
254
355
  #
255
- # Capture groups can be referred to by name when defined with the
256
- # `(?<`*name*`>)` or `(?'`*name*`')` constructs.
356
+ # For a subexpression with an anchor, matching must begin at the matched anchor.
257
357
  #
258
- # /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")
259
- # #=> #<MatchData "$3.67" dollars:"3" cents:"67">
260
- # /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")[:dollars] #=> "3"
358
+ # #### Boundary Anchors
261
359
  #
262
- # Named groups can be backreferenced with `\k<`*name*`>`, where *name* is the
263
- # group name.
360
+ # Each of these anchors matches a boundary:
264
361
  #
265
- # /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy')
266
- # #=> #<MatchData "ototo" vowel:"o">
362
+ # * `^`: Matches the beginning of a line:
363
+ #
364
+ # /^bar/.match("foo\nbar") # => #<MatchData "bar">
365
+ # /^ar/.match("foo\nbar") # => nil
366
+ #
367
+ # * `$`: Matches the end of a line:
368
+ #
369
+ # /bar$/.match("foo\nbar") # => #<MatchData "bar">
370
+ # /ba$/.match("foo\nbar") # => nil
371
+ #
372
+ # * `\A`: Matches the beginning of the string:
373
+ #
374
+ # /\Afoo/.match('foo bar') # => #<MatchData "foo">
375
+ # /\Afoo/.match(' foo bar') # => nil
376
+ #
377
+ # * `\Z`: Matches the end of the string; if the string ends with a single newline,
378
+ # it matches just before the ending newline:
379
+ #
380
+ # /foo\Z/.match('bar foo') # => #<MatchData "foo">
381
+ # /foo\Z/.match('foo bar') # => nil
382
+ # /foo\Z/.match("bar foo\n") # => #<MatchData "foo">
383
+ # /foo\Z/.match("bar foo\n\n") # => nil
384
+ #
385
+ # * `\z`: Matches the end of the string:
386
+ #
387
+ # /foo\z/.match('bar foo') # => #<MatchData "foo">
388
+ # /foo\z/.match('foo bar') # => nil
389
+ # /foo\z/.match("bar foo\n") # => nil
390
+ #
391
+ # * `\b`: Matches a word boundary when not inside brackets; matches backspace
392
+ # (`"0x08"`) when inside brackets:
393
+ #
394
+ # /foo\b/.match('foo bar') # => #<MatchData "foo">
395
+ # /foo\b/.match('foobar') # => nil
396
+ #
397
+ # * `\B`: Matches non-word boundary:
398
+ #
399
+ # /foo\B/.match('foobar') # => #<MatchData "foo">
400
+ # /foo\B/.match('foo bar') # => nil
401
+ #
402
+ # * `\G`: Matches first matching position:
403
+ #
404
+ # In methods like String#gsub and String#scan, it changes on each iteration.
405
+ # It initially matches the beginning of the subject, and in each following
406
+ # iteration it matches where the last match finished.
407
+ #
408
+ # " a b c".gsub(/ /, '_') # => "____a_b_c"
409
+ # " a b c".gsub(/\G /, '_') # => "____a b c"
410
+ #
411
+ # In methods like Regexp#match and String#match that take an optional
412
+ # offset, it matches where the search begins.
413
+ #
414
+ # "hello, world".match(/,/, 3) # => #<MatchData ",">
415
+ # "hello, world".match(/\G,/, 3) # => nil
416
+ #
417
+ #
418
+ # #### Lookaround Anchors
419
+ #
420
+ # Lookahead anchors:
421
+ #
422
+ # * `(?=*pat*)`: Positive lookahead assertion: ensures that the following
423
+ # characters match *pat*, but doesn't include those characters in the
424
+ # matched substring.
425
+ #
426
+ # * `(?!*pat*)`: Negative lookahead assertion: ensures that the following
427
+ # characters *do not* match *pat*, but doesn't include those characters in
428
+ # the matched substring.
429
+ #
430
+ #
431
+ # Lookbehind anchors:
432
+ #
433
+ # * `(?<=*pat*)`: Positive lookbehind assertion: ensures that the preceding
434
+ # characters match *pat*, but doesn't include those characters in the
435
+ # matched substring.
436
+ #
437
+ # * `(?<!*pat*)`: Negative lookbehind assertion: ensures that the preceding
438
+ # characters do not match *pat*, but doesn't include those characters in the
439
+ # matched substring.
440
+ #
441
+ #
442
+ # The pattern below uses positive lookahead and positive lookbehind to match
443
+ # text appearing in `<b>`...`</b>` tags without including the tags in the match:
444
+ #
445
+ # /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favors the <b>bold</b>.")
446
+ # # => #<MatchData "bold">
447
+ #
448
+ # #### Match-Reset Anchor
449
+ #
450
+ # * `\K`: Match reset: the matched content preceding `\K` in the regexp is
451
+ # excluded from the result. For example, the following two regexps are
452
+ # almost equivalent:
453
+ #
454
+ # /ab\Kc/.match('abc') # => #<MatchData "c">
455
+ # /(?<=ab)c/.match('abc') # => #<MatchData "c">
456
+ #
457
+ # These match the same string and `$&` equals `'c'`, while the matched position
458
+ # is different.
459
+ #
460
+ # As are the following two regexps:
461
+ #
462
+ # /(a)\K(b)\Kc/
463
+ # /(?<=(?<=(a))(b))c/
464
+ #
465
+ #
466
+ # ### Alternation
467
+ #
468
+ # The vertical bar metacharacter (`|`) may be used within parentheses to express
469
+ # alternation: two or more subexpressions any of which may match the target
470
+ # string.
471
+ #
472
+ # Two alternatives:
473
+ #
474
+ # re = /(a|b)/
475
+ # re.match('foo') # => nil
476
+ # re.match('bar') # => #<MatchData "b" 1:"b">
477
+ #
478
+ # Four alternatives:
479
+ #
480
+ # re = /(a|b|c|d)/
481
+ # re.match('shazam') # => #<MatchData "a" 1:"a">
482
+ # re.match('cold') # => #<MatchData "c" 1:"c">
483
+ #
484
+ # Each alternative is a subexpression, and may be composed of other
485
+ # subexpressions:
486
+ #
487
+ # re = /([a-c]|[x-z])/
488
+ # re.match('bar') # => #<MatchData "b" 1:"b">
489
+ # re.match('ooz') # => #<MatchData "z" 1:"z">
490
+ #
491
+ # Method Regexp.union provides a convenient way to construct a regexp with
492
+ # alternatives.
493
+ #
494
+ # ### Quantifiers
495
+ #
496
+ # A simple regexp matches one character:
497
+ #
498
+ # /\w/.match('Hello') # => #<MatchData "H">
499
+ #
500
+ # An added *quantifier* specifies how many matches are required or allowed:
501
+ #
502
+ # * `*` - Matches zero or more times:
503
+ #
504
+ # /\w*/.match('')
505
+ # # => #<MatchData "">
506
+ # /\w*/.match('x')
507
+ # # => #<MatchData "x">
508
+ # /\w*/.match('xyz')
509
+ # # => #<MatchData "xyz">
510
+ #
511
+ # * `+` - Matches one or more times:
512
+ #
513
+ # /\w+/.match('') # => nil
514
+ # /\w+/.match('x') # => #<MatchData "x">
515
+ # /\w+/.match('xyz') # => #<MatchData "xyz">
516
+ #
517
+ # * `?` - Matches zero or one times:
518
+ #
519
+ # /\w?/.match('') # => #<MatchData "">
520
+ # /\w?/.match('x') # => #<MatchData "x">
521
+ # /\w?/.match('xyz') # => #<MatchData "x">
522
+ #
523
+ # * `{`*n*`}` - Matches exactly *n* times:
524
+ #
525
+ # /\w{2}/.match('') # => nil
526
+ # /\w{2}/.match('x') # => nil
527
+ # /\w{2}/.match('xyz') # => #<MatchData "xy">
528
+ #
529
+ # * `{`*min*`,}` - Matches *min* or more times:
530
+ #
531
+ # /\w{2,}/.match('') # => nil
532
+ # /\w{2,}/.match('x') # => nil
533
+ # /\w{2,}/.match('xy') # => #<MatchData "xy">
534
+ # /\w{2,}/.match('xyz') # => #<MatchData "xyz">
535
+ #
536
+ # * `{,`*max*`}` - Matches *max* or fewer times:
537
+ #
538
+ # /\w{,2}/.match('') # => #<MatchData "">
539
+ # /\w{,2}/.match('x') # => #<MatchData "x">
540
+ # /\w{,2}/.match('xyz') # => #<MatchData "xy">
541
+ #
542
+ # * `{`*min*`,`*max*`}` - Matches at least *min* times and at most *max*
543
+ # times:
544
+ #
545
+ # /\w{1,2}/.match('') # => nil
546
+ # /\w{1,2}/.match('x') # => #<MatchData "x">
547
+ # /\w{1,2}/.match('xyz') # => #<MatchData "xy">
548
+ #
549
+ #
550
+ # #### Greedy, Lazy, or Possessive Matching
551
+ #
552
+ # Quantifier matching may be greedy, lazy, or possessive:
553
+ #
554
+ # * In *greedy* matching, as many occurrences as possible are matched while
555
+ # still allowing the overall match to succeed. Greedy quantifiers: `*`, `+`,
556
+ # `?`, `{min, max}` and its variants.
557
+ # * In *lazy* matching, the minimum number of occurrences are matched. Lazy
558
+ # quantifiers: `*?`, `+?`, `??`, `{min, max}?` and its variants.
559
+ # * In *possessive* matching, once a match is found, there is no backtracking;
560
+ # that match is retained, even if it jeopardises the overall match.
561
+ # Possessive quantifiers: `*+`, `++`, `?+`. Note that `{min, max}` and its
562
+ # variants do *not* support possessive matching.
563
+ #
564
+ #
565
+ # More:
566
+ #
567
+ # * About greedy and lazy matching, see [Choosing Minimal or Maximal
568
+ # Repetition](https://doc.lagout.org/programmation/Regular%20Expressions/Reg
569
+ # ular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Progr
570
+ # amming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%20201
571
+ # 2-09-06%5D.pdf#tutorial-backtrack).
572
+ # * About possessive matching, see [Eliminate Needless
573
+ # Backtracking](https://doc.lagout.org/programmation/Regular%20Expressions/R
574
+ # egular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Pro
575
+ # gramming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202
576
+ # 012-09-06%5D.pdf#tutorial-backtrack).
577
+ #
578
+ #
579
+ # ### Groups and Captures
580
+ #
581
+ # A simple regexp has (at most) one match:
582
+ #
583
+ # re = /\d\d\d\d-\d\d-\d\d/
584
+ # re.match('1943-02-04') # => #<MatchData "1943-02-04">
585
+ # re.match('1943-02-04').size # => 1
586
+ # re.match('foo') # => nil
587
+ #
588
+ # Adding one or more pairs of parentheses, `(*subexpression*)`, defines
589
+ # *groups*, which may result in multiple matched substrings, called *captures*:
590
+ #
591
+ # re = /(\d\d\d\d)-(\d\d)-(\d\d)/
592
+ # re.match('1943-02-04') # => #<MatchData "1943-02-04" 1:"1943" 2:"02" 3:"04">
593
+ # re.match('1943-02-04').size # => 4
594
+ #
595
+ # The first capture is the entire matched string; the other captures are the
596
+ # matched substrings from the groups.
597
+ #
598
+ # A group may have a [quantifier](rdoc-ref:Regexp@Quantifiers):
599
+ #
600
+ # re = /July 4(th)?/
601
+ # re.match('July 4') # => #<MatchData "July 4" 1:nil>
602
+ # re.match('July 4th') # => #<MatchData "July 4th" 1:"th">
603
+ #
604
+ # re = /(foo)*/
605
+ # re.match('') # => #<MatchData "" 1:nil>
606
+ # re.match('foo') # => #<MatchData "foo" 1:"foo">
607
+ # re.match('foofoo') # => #<MatchData "foofoo" 1:"foo">
608
+ #
609
+ # re = /(foo)+/
610
+ # re.match('') # => nil
611
+ # re.match('foo') # => #<MatchData "foo" 1:"foo">
612
+ # re.match('foofoo') # => #<MatchData "foofoo" 1:"foo">
613
+ #
614
+ # The returned MatchData object gives access to the matched substrings:
615
+ #
616
+ # re = /(\d\d\d\d)-(\d\d)-(\d\d)/
617
+ # md = re.match('1943-02-04')
618
+ # # => #<MatchData "1943-02-04" 1:"1943" 2:"02" 3:"04">
619
+ # md[0] # => "1943-02-04"
620
+ # md[1] # => "1943"
621
+ # md[2] # => "02"
622
+ # md[3] # => "04"
267
623
  #
268
- # **Note**: A regexp can't use named backreferences and numbered backreferences
269
- # simultaneously. Also, if a named capture is used in a regexp, then parentheses
270
- # used for grouping which would otherwise result in a unnamed capture are
271
- # treated as non-capturing.
624
+ # #### Non-Capturing Groups
272
625
  #
273
- # /(\w)(\w)/.match("ab").captures # => ["a", "b"]
274
- # /(\w)(\w)/.match("ab").named_captures # => {}
626
+ # A group may be made non-capturing; it is still a group (and, for example, can
627
+ # have a quantifier), but its matching substring is not included among the
628
+ # captures.
275
629
  #
276
- # /(?<c>\w)(\w)/.match("ab").captures # => ["a"]
277
- # /(?<c>\w)(\w)/.match("ab").named_captures # => {"c"=>"a"}
630
+ # A non-capturing group begins with `?:` (inside the parentheses):
278
631
  #
279
- # When named capture groups are used with a literal regexp on the left-hand side
280
- # of an expression and the `=~` operator, the captured text is also assigned to
281
- # local variables with corresponding names.
632
+ # # Don't capture the year.
633
+ # re = /(?:\d\d\d\d)-(\d\d)-(\d\d)/
634
+ # md = re.match('1943-02-04') # => #<MatchData "1943-02-04" 1:"02" 2:"04">
282
635
  #
283
- # /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ "$3.67" #=> 0
284
- # dollars #=> "3"
636
+ # #### Backreferences
285
637
  #
286
- # ## Grouping
638
+ # A group match may also be referenced within the regexp itself; such a
639
+ # reference is called a `backreference`:
287
640
  #
288
- # Parentheses also *group* the terms they enclose, allowing them to be
289
- # quantified as one *atomic* whole.
641
+ # /[csh](..) [csh]\1 in/.match('The cat sat in the hat')
642
+ # # => #<MatchData "cat sat in" 1:"at">
290
643
  #
291
- # The pattern below matches a vowel followed by 2 word characters:
644
+ # This table shows how each subexpression in the regexp above matches a
645
+ # substring in the target string:
292
646
  #
293
- # /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> #<MatchData "aen">
647
+ # | Subexpression in Regexp | Matching Substring in Target String |
648
+ # |---------------------------|-------------------------------------|
649
+ # | First '[csh]' | Character 'c' |
650
+ # | '(..)' | First substring 'at' |
651
+ # | First space ' ' | First space character ' ' |
652
+ # | Second '[csh]' | Character 's' |
653
+ # | '\1' (backreference 'at') | Second substring 'at' |
654
+ # | ' in' | Substring ' in' |
294
655
  #
295
- # Whereas the following pattern matches a vowel followed by a word character,
296
- # twice, i.e. `[aeiou]\w[aeiou]\w`: 'enor'.
656
+ # A regexp may contain any number of groups:
297
657
  #
298
- # /([aeiou]\w){2}/.match("Caenorhabditis elegans")
299
- # #=> #<MatchData "enor" 1:"or">
658
+ # * For a large number of groups:
300
659
  #
301
- # The `(?:`...`)` construct provides grouping without capturing. That is, it
302
- # combines the terms it contains into an atomic whole without creating a
303
- # backreference. This benefits performance at the slight expense of readability.
660
+ # * The ordinary `\`*n* notation applies only for *n* in range (1..9).
661
+ # * The `MatchData[*n*]` notation applies for any non-negative *n*.
304
662
  #
305
- # The first group of parentheses captures 'n' and the second 'ti'. The second
306
- # group is referred to later with the backreference `\2`:
307
663
  #
308
- # /I(n)ves(ti)ga\2ons/.match("Investigations")
309
- # #=> #<MatchData "Investigations" 1:"n" 2:"ti">
664
+ # * `\0` is a special backreference, referring to the entire matched string;
665
+ # it may not be used within the regexp itself, but may be used outside it
666
+ # (for example, in a substitution method call):
310
667
  #
311
- # The first group of parentheses is now made non-capturing with '?:', so it
312
- # still matches 'n', but doesn't create the backreference. Thus, the
313
- # backreference `\1` now refers to 'ti'.
668
+ # 'The cat sat in the hat'.gsub(/[csh]at/, '\0s')
669
+ # # => "The cats sats in the hats"
314
670
  #
315
- # /I(?:n)ves(ti)ga\1ons/.match("Investigations")
316
- # #=> #<MatchData "Investigations" 1:"ti">
317
671
  #
318
- # ### Atomic Grouping
672
+ # #### Named Captures
319
673
  #
320
- # Grouping can be made *atomic* with `(?>`*pat*`)`. This causes the
321
- # subexpression *pat* to be matched independently of the rest of the expression
322
- # such that what it matches becomes fixed for the remainder of the match, unless
323
- # the entire subexpression must be abandoned and subsequently revisited. In this
324
- # way *pat* is treated as a non-divisible whole. Atomic grouping is typically
325
- # used to optimise patterns so as to prevent the regular expression engine from
326
- # backtracking needlessly.
674
+ # As seen above, a capture can be referred to by its number. A capture can also
675
+ # have a name, prefixed as `?<*name*>` or `?'*name*'`, and the name (symbolized)
676
+ # may be used as an index in `MatchData[]`:
327
677
  #
328
- # The `"` in the pattern below matches the first character of the string, then
329
- # `.*` matches *Quote"*. This causes the overall match to fail, so the text
330
- # matched by `.*` is backtracked by one position, which leaves the final
331
- # character of the string available to match `"`
678
+ # md = /\$(?<dollars>\d+)\.(?'cents'\d+)/.match("$3.67")
679
+ # # => #<MatchData "$3.67" dollars:"3" cents:"67">
680
+ # md[:dollars] # => "3"
681
+ # md[:cents] # => "67"
682
+ # # The capture numbers are still valid.
683
+ # md[2] # => "67"
332
684
  #
333
- # /".*"/.match('"Quote"') #=> #<MatchData "\"Quote\"">
685
+ # When a regexp contains a named capture, there are no unnamed captures:
334
686
  #
335
- # If `.*` is grouped atomically, it refuses to backtrack *Quote"*, even though
336
- # this means that the overall match fails
687
+ # /\$(?<dollars>\d+)\.(\d+)/.match("$3.67")
688
+ # # => #<MatchData "$3.67" dollars:"3">
337
689
  #
338
- # /"(?>.*)"/.match('"Quote"') #=> nil
690
+ # A named group may be backreferenced as `\k<*name*>`:
339
691
  #
340
- # ## Subexpression Calls
692
+ # /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy')
693
+ # # => #<MatchData "ototo" vowel:"o">
694
+ #
695
+ # When (and only when) a regexp contains named capture groups and appears before
696
+ # the `=~` operator, the captured substrings are assigned to local variables
697
+ # with corresponding names:
698
+ #
699
+ # /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ '$3.67'
700
+ # dollars # => "3"
701
+ # cents # => "67"
702
+ #
703
+ # Method MatchData#named_captures returns a hash of the capture names and
704
+ # substrings; method Regexp#names returns an array of the capture names.
705
+ #
706
+ # #### Atomic Grouping
707
+ #
708
+ # A group may be made *atomic* with `(?>`*subexpression*`)`.
709
+ #
710
+ # This causes the subexpression to be matched independently of the rest of the
711
+ # expression, so that the matched substring becomes fixed for the remainder of
712
+ # the match, unless the entire subexpression must be abandoned and subsequently
713
+ # revisited.
714
+ #
715
+ # In this way *subexpression* is treated as a non-divisible whole. Atomic
716
+ # grouping is typically used to optimise patterns to prevent needless
717
+ # backtracking.
718
+ #
719
+ # Example (without atomic grouping):
341
720
  #
342
- # The `\g<`*name*`>` syntax matches the previous subexpression named *name*,
343
- # which can be a group name or number, again. This differs from backreferences
344
- # in that it re-executes the group rather than simply trying to re-match the
345
- # same text.
721
+ # /".*"/.match('"Quote"') # => #<MatchData "\"Quote\"">
346
722
  #
347
- # This pattern matches a *(* character and assigns it to the `paren` group,
348
- # tries to call that the `paren` sub-expression again but fails, then matches a
349
- # literal *)*:
723
+ # Analysis:
350
724
  #
351
- # /\A(?<paren>\(\g<paren>*\))*\z/ =~ '()'
725
+ # 1. The leading subexpression `"` in the pattern matches the first character
726
+ # `"` in the target string.
727
+ # 2. The next subexpression `.*` matches the next substring `Quote"` (including
728
+ # the trailing double-quote).
729
+ # 3. Now there is nothing left in the target string to match the trailing
730
+ # subexpression `"` in the pattern; this would cause the overall match to
731
+ # fail.
732
+ # 4. The matched substring is backtracked by one position: `Quote`.
733
+ # 5. The final subexpression `"` now matches the final substring `"`, and the
734
+ # overall match succeeds.
352
735
  #
353
- # /\A(?<paren>\(\g<paren>*\))*\z/ =~ '(())' #=> 0
736
+ #
737
+ # If subexpression `.*` is grouped atomically, the backtracking is disabled, and
738
+ # the overall match fails:
739
+ #
740
+ # /"(?>.*)"/.match('"Quote"') # => nil
741
+ #
742
+ # Atomic grouping can affect performance; see [Atomic
743
+ # Group](https://www.regular-expressions.info/atomic.html).
744
+ #
745
+ # #### Subexpression Calls
746
+ #
747
+ # As seen above, a backreference number (`\`*n*) or name (`\k<*name*>`) gives
748
+ # access to a captured *substring*; the corresponding regexp *subexpression* may
749
+ # also be accessed, via the number (`\g<*n*>`) or name (`\g<*name*>`):
750
+ #
751
+ # /\A(?<paren>\(\g<paren>*\))*\z/.match('(())')
354
752
  # # ^1
355
753
  # # ^2
356
754
  # # ^3
@@ -362,415 +760,576 @@
362
760
  # # ^9
363
761
  # # ^10
364
762
  #
763
+ # The pattern:
764
+ #
365
765
  # 1. Matches at the beginning of the string, i.e. before the first character.
366
- # 2. Enters a named capture group called `paren`
367
- # 3. Matches a literal *(*, the first character in the string
368
- # 4. Calls the `paren` group again, i.e. recurses back to the second step
369
- # 5. Re-enters the `paren` group
370
- # 6. Matches a literal *(*, the second character in the string
371
- # 7. Try to call `paren` a third time, but fail because doing so would prevent
372
- # an overall successful match
373
- # 8. Match a literal *)*, the third character in the string. Marks the end of
374
- # the second recursive call
375
- # 9. Match a literal *)*, the fourth character in the string
376
- # 10. Match the end of the string
377
- #
378
- #
379
- # ## Alternation
380
- #
381
- # The vertical bar metacharacter (`|`) combines several expressions into a
382
- # single one that matches any of the expressions. Each expression is an
383
- # *alternative*.
384
- #
385
- # /\w(and|or)\w/.match("Feliformia") #=> #<MatchData "form" 1:"or">
386
- # /\w(and|or)\w/.match("furandi") #=> #<MatchData "randi" 1:"and">
387
- # /\w(and|or)\w/.match("dissemblance") #=> nil
388
- #
389
- # ## Character Properties
390
- #
391
- # The `\p{}` construct matches characters with the named property, much like
392
- # POSIX bracket classes.
393
- #
394
- # * `/\p{Alnum}/` - Alphabetic and numeric character
395
- # * `/\p{Alpha}/` - Alphabetic character
396
- # * `/\p{Blank}/` - Space or tab
397
- # * `/\p{Cntrl}/` - Control character
398
- # * `/\p{Digit}/` - Digit
399
- # * `/\p{Emoji}/` - Unicode emoji
400
- # * `/\p{Graph}/` - Non-blank character (excludes spaces, control characters,
401
- # and similar)
402
- # * `/\p{Lower}/` - Lowercase alphabetical character
403
- # * `/\p{Print}/` - Like `\p{Graph}`, but includes the space character
404
- # * `/\p{Punct}/` - Punctuation character
405
- # * `/\p{Space}/` - Whitespace character (`[:blank:]`, newline, carriage
766
+ # 2. Enters a named group `paren`.
767
+ # 3. Matches the first character in the string, `'('`.
768
+ # 4. Calls the `paren` group again, i.e. recurses back to the second step.
769
+ # 5. Re-enters the `paren` group.
770
+ # 6. Matches the second character in the string, `'('`.
771
+ # 7. Attempts to call `paren` a third time, but fails because doing so would
772
+ # prevent an overall successful match.
773
+ # 8. Matches the third character in the string, `')'`; marks the end of the
774
+ # second recursive call.
775
+ # 9. Matches the fourth character in the string, `')'`.
776
+ # 10. Matches the end of the string.
777
+ #
778
+ #
779
+ # See [Subexpression
780
+ # calls](https://learnbyexample.github.io/Ruby_Regexp/groupings-and-backreferenc
781
+ # es.html?highlight=subexpression#subexpression-calls).
782
+ #
783
+ # #### Conditionals
784
+ #
785
+ # The conditional construct takes the form `(?(*cond*)*yes*|*no*)`, where:
786
+ #
787
+ # * *cond* may be a capture number or name.
788
+ # * The match to be applied is *yes* if *cond* is captured; otherwise the
789
+ # match to be applied is *no*.
790
+ # * If not needed, `|`*no* may be omitted.
791
+ #
792
+ #
793
+ # Examples:
794
+ #
795
+ # re = /\A(foo)?(?(1)(T)|(F))\z/
796
+ # re.match('fooT') # => #<MatchData "fooT" 1:"foo" 2:"T" 3:nil>
797
+ # re.match('F') # => #<MatchData "F" 1:nil 2:nil 3:"F">
798
+ # re.match('fooF') # => nil
799
+ # re.match('T') # => nil
800
+ #
801
+ # re = /\A(?<xyzzy>foo)?(?(<xyzzy>)(T)|(F))\z/
802
+ # re.match('fooT') # => #<MatchData "fooT" xyzzy:"foo">
803
+ # re.match('F') # => #<MatchData "F" xyzzy:nil>
804
+ # re.match('fooF') # => nil
805
+ # re.match('T') # => nil
806
+ #
807
+ # #### Absence Operator
808
+ #
809
+ # The absence operator is a special group that matches anything which does *not*
810
+ # match the contained subexpressions.
811
+ #
812
+ # /(?~real)/.match('surrealist') # => #<MatchData "surrea">
813
+ # /(?~real)ist/.match('surrealist') # => #<MatchData "ealist">
814
+ # /sur(?~real)ist/.match('surrealist') # => nil
815
+ #
816
+ # ### Unicode
817
+ #
818
+ # #### Unicode Properties
819
+ #
820
+ # The `/\p{*property_name*}/` construct (with lowercase `p`) matches characters
821
+ # using a Unicode property name, much like a character class; property `Alpha`
822
+ # specifies alphabetic characters:
823
+ #
824
+ # /\p{Alpha}/.match('a') # => #<MatchData "a">
825
+ # /\p{Alpha}/.match('1') # => nil
826
+ #
827
+ # A property can be inverted by prefixing the name with a caret character (`^`):
828
+ #
829
+ # /\p{^Alpha}/.match('1') # => #<MatchData "1">
830
+ # /\p{^Alpha}/.match('a') # => nil
831
+ #
832
+ # Or by using `\P` (uppercase `P`):
833
+ #
834
+ # /\P{Alpha}/.match('1') # => #<MatchData "1">
835
+ # /\P{Alpha}/.match('a') # => nil
836
+ #
837
+ # See [Unicode Properties](rdoc-ref:regexp/unicode_properties.rdoc) for regexps
838
+ # based on the numerous properties.
839
+ #
840
+ # Some commonly-used properties correspond to POSIX bracket expressions:
841
+ #
842
+ # * `/\p{Alnum}/`: Alphabetic and numeric character
843
+ # * `/\p{Alpha}/`: Alphabetic character
844
+ # * `/\p{Blank}/`: Space or tab
845
+ # * `/\p{Cntrl}/`: Control character
846
+ # * `/\p{Digit}/`: Digit character
847
+ # * `/\p{Lower}/`: Lowercase alphabetical character
848
+ # * `/\p{Print}/`: Like `\p{Graph}`, but includes the space character
849
+ # * `/\p{Punct}/`: Punctuation character
850
+ # * `/\p{Space}/`: Whitespace character (`[:blank:]`, newline, carriage
406
851
  # return, etc.)
407
- # * `/\p{Upper}/` - Uppercase alphabetical
408
- # * `/\p{XDigit}/` - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
409
- # * `/\p{Word}/` - A member of one of the following Unicode general category
410
- # *Letter*, *Mark*, *Number*, *Connector_Punctuation*
411
- # * `/\p{ASCII}/` - A character in the ASCII character set
412
- # * `/\p{Any}/` - Any Unicode character (including unassigned characters)
413
- # * `/\p{Assigned}/` - An assigned character
414
- #
415
- #
416
- # A Unicode character's *General Category* value can also be matched with
417
- # `\p{`*Ab*`}` where *Ab* is the category's abbreviation as described below:
418
- #
419
- # * `/\p{L}/` - 'Letter'
420
- # * `/\p{Ll}/` - 'Letter: Lowercase'
421
- # * `/\p{Lm}/` - 'Letter: Mark'
422
- # * `/\p{Lo}/` - 'Letter: Other'
423
- # * `/\p{Lt}/` - 'Letter: Titlecase'
424
- # * `/\p{Lu}/` - 'Letter: Uppercase
425
- # * `/\p{Lo}/` - 'Letter: Other'
426
- # * `/\p{M}/` - 'Mark'
427
- # * `/\p{Mn}/` - 'Mark: Nonspacing'
428
- # * `/\p{Mc}/` - 'Mark: Spacing Combining'
429
- # * `/\p{Me}/` - 'Mark: Enclosing'
430
- # * `/\p{N}/` - 'Number'
431
- # * `/\p{Nd}/` - 'Number: Decimal Digit'
432
- # * `/\p{Nl}/` - 'Number: Letter'
433
- # * `/\p{No}/` - 'Number: Other'
434
- # * `/\p{P}/` - 'Punctuation'
435
- # * `/\p{Pc}/` - 'Punctuation: Connector'
436
- # * `/\p{Pd}/` - 'Punctuation: Dash'
437
- # * `/\p{Ps}/` - 'Punctuation: Open'
438
- # * `/\p{Pe}/` - 'Punctuation: Close'
439
- # * `/\p{Pi}/` - 'Punctuation: Initial Quote'
440
- # * `/\p{Pf}/` - 'Punctuation: Final Quote'
441
- # * `/\p{Po}/` - 'Punctuation: Other'
442
- # * `/\p{S}/` - 'Symbol'
443
- # * `/\p{Sm}/` - 'Symbol: Math'
444
- # * `/\p{Sc}/` - 'Symbol: Currency'
445
- # * `/\p{Sc}/` - 'Symbol: Currency'
446
- # * `/\p{Sk}/` - 'Symbol: Modifier'
447
- # * `/\p{So}/` - 'Symbol: Other'
448
- # * `/\p{Z}/` - 'Separator'
449
- # * `/\p{Zs}/` - 'Separator: Space'
450
- # * `/\p{Zl}/` - 'Separator: Line'
451
- # * `/\p{Zp}/` - 'Separator: Paragraph'
452
- # * `/\p{C}/` - 'Other'
453
- # * `/\p{Cc}/` - 'Other: Control'
454
- # * `/\p{Cf}/` - 'Other: Format'
455
- # * `/\p{Cn}/` - 'Other: Not Assigned'
456
- # * `/\p{Co}/` - 'Other: Private Use'
457
- # * `/\p{Cs}/` - 'Other: Surrogate'
458
- #
459
- #
460
- # Lastly, `\p{}` matches a character's Unicode *script*. The following scripts
461
- # are supported: *Arabic*, *Armenian*, *Balinese*, *Bengali*, *Bopomofo*,
462
- # *Braille*, *Buginese*, *Buhid*, *Canadian_Aboriginal*, *Carian*, *Cham*,
463
- # *Cherokee*, *Common*, *Coptic*, *Cuneiform*, *Cypriot*, *Cyrillic*, *Deseret*,
464
- # *Devanagari*, *Ethiopic*, *Georgian*, *Glagolitic*, *Gothic*, *Greek*,
465
- # *Gujarati*, *Gurmukhi*, *Han*, *Hangul*, *Hanunoo*, *Hebrew*, *Hiragana*,
466
- # *Inherited*, *Kannada*, *Katakana*, *Kayah_Li*, *Kharoshthi*, *Khmer*, *Lao*,
467
- # *Latin*, *Lepcha*, *Limbu*, *Linear_B*, *Lycian*, *Lydian*, *Malayalam*,
468
- # *Mongolian*, *Myanmar*, *New_Tai_Lue*, *Nko*, *Ogham*, *Ol_Chiki*,
469
- # *Old_Italic*, *Old_Persian*, *Oriya*, *Osmanya*, *Phags_Pa*, *Phoenician*,
470
- # *Rejang*, *Runic*, *Saurashtra*, *Shavian*, *Sinhala*, *Sundanese*,
471
- # *Syloti_Nagri*, *Syriac*, *Tagalog*, *Tagbanwa*, *Tai_Le*, *Tamil*, *Telugu*,
472
- # *Thaana*, *Thai*, *Tibetan*, *Tifinagh*, *Ugaritic*, *Vai*, and *Yi*.
473
- #
474
- # Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and belongs to the
475
- # Arabic script:
476
- #
477
- # /\p{Arabic}/.match("\u06E9") #=> #<MatchData "\u06E9">
478
- #
479
- # All character properties can be inverted by prefixing their name with a caret
480
- # (`^`).
481
- #
482
- # Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so this
483
- # match succeeds:
484
- #
485
- # /\p{^Ll}/.match("A") #=> #<MatchData "A">
486
- #
487
- # ## Anchors
488
- #
489
- # Anchors are metacharacter that match the zero-width positions between
490
- # characters, *anchoring* the match to a specific position.
491
- #
492
- # * `^` - Matches beginning of line
493
- # * `$` - Matches end of line
494
- # * `\A` - Matches beginning of string.
495
- # * `\Z` - Matches end of string. If string ends with a newline, it matches
496
- # just before newline
497
- # * `\z` - Matches end of string
498
- # * `\G` - Matches first matching position:
499
- #
500
- # In methods like `String#gsub` and `String#scan`, it changes on each
501
- # iteration. It initially matches the beginning of subject, and in each
502
- # following iteration it matches where the last match finished.
503
- #
504
- # " a b c".gsub(/ /, '_') #=> "____a_b_c"
505
- # " a b c".gsub(/\G /, '_') #=> "____a b c"
506
- #
507
- # In methods like `Regexp#match` and `String#match` that take an (optional)
508
- # offset, it matches where the search begins.
852
+ # * `/\p{Upper}/`: Uppercase alphabetical character
853
+ # * `/\p{XDigit}/`: Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
509
854
  #
510
- # "hello, world".match(/,/, 3) #=> #<MatchData ",">
511
- # "hello, world".match(/\G,/, 3) #=> nil
512
855
  #
513
- # * `\b` - Matches word boundaries when outside brackets; backspace (0x08)
514
- # when inside brackets
515
- # * `\B` - Matches non-word boundaries
516
- # * `(?=`*pat*`)` - *Positive lookahead* assertion: ensures that the following
517
- # characters match *pat*, but doesn't include those characters in the
518
- # matched text
519
- # * `(?!`*pat*`)` - *Negative lookahead* assertion: ensures that the following
520
- # characters do not match *pat*, but doesn't include those characters in the
521
- # matched text
522
- # * `(?<=`*pat*`)` - *Positive lookbehind* assertion: ensures that the
523
- # preceding characters match *pat*, but doesn't include those characters in
524
- # the matched text
525
- # * `(?<!`*pat*`)` - *Negative lookbehind* assertion: ensures that the
526
- # preceding characters do not match *pat*, but doesn't include those
527
- # characters in the matched text
528
- #
529
- # * `\K` - *Match reset*: the matched content preceding `\K` in the regexp is
530
- # excluded from the result. For example, the following two regexps are
531
- # almost equivalent:
856
+ # These are also commonly used:
532
857
  #
533
- # /ab\Kc/ =~ "abc" #=> 0
534
- # /(?<=ab)c/ =~ "abc" #=> 2
858
+ # * `/\p{Emoji}/`: Unicode emoji.
859
+ # * `/\p{Graph}/`: Non-blank character (excludes spaces, control characters,
860
+ # and similar).
861
+ # * `/\p{Word}/`: A member in one of these Unicode character categories (see
862
+ # below) or having one of these Unicode properties:
535
863
  #
536
- # These match same string and *$&* equals `"c"`, while the matched position
537
- # is different.
864
+ # * Unicode categories:
865
+ # * `Mark` (`M`).
866
+ # * `Decimal Number` (`Nd`).
867
+ # * `Connector Punctuation` (`Pc`).
538
868
  #
539
- # As are the following two regexps:
540
869
  #
541
- # /(a)\K(b)\Kc/
542
- # /(?<=(?<=(a))(b))c/
870
+ # * Unicode properties:
871
+ # * `Alpha`
872
+ # * `Join_Control`
543
873
  #
544
874
  #
545
- # If a pattern isn't anchored it can begin at any point in the string:
546
875
  #
547
- # /real/.match("surrealist") #=> #<MatchData "real">
876
+ # * `/\p{ASCII}/`: A character in the ASCII character set.
877
+ # * `/\p{Any}/`: Any Unicode character (including unassigned characters).
878
+ # * `/\p{Assigned}/`: An assigned character.
548
879
  #
549
- # Anchoring the pattern to the beginning of the string forces the match to start
550
- # there. 'real' doesn't occur at the beginning of the string, so now the match
551
- # fails:
552
880
  #
553
- # /\Areal/.match("surrealist") #=> nil
881
+ # #### Unicode Character Categories
554
882
  #
555
- # The match below fails because although 'Demand' contains 'and', the pattern
556
- # does not occur at a word boundary.
883
+ # A Unicode character category name:
557
884
  #
558
- # /\band/.match("Demand")
885
+ # * May be either its full name or its abbreviated name.
886
+ # * Is case-insensitive.
887
+ # * Treats a space, a hyphen, and an underscore as equivalent.
559
888
  #
560
- # Whereas in the following example 'and' has been anchored to a non-word
561
- # boundary so instead of matching the first 'and' it matches from the fourth
562
- # letter of 'demand' instead:
563
889
  #
564
- # /\Band.+/.match("Supply and demand curve") #=> #<MatchData "and curve">
890
+ # Examples:
565
891
  #
566
- # The pattern below uses positive lookahead and positive lookbehind to match
567
- # text appearing in tags without including the tags in the match:
892
+ # /\p{lu}/ # => /\p{lu}/
893
+ # /\p{LU}/ # => /\p{LU}/
894
+ # /\p{Uppercase Letter}/ # => /\p{Uppercase Letter}/
895
+ # /\p{Uppercase_Letter}/ # => /\p{Uppercase_Letter}/
896
+ # /\p{UPPERCASE-LETTER}/ # => /\p{UPPERCASE-LETTER}/
568
897
  #
569
- # /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favours the <b>bold</b>")
570
- # #=> #<MatchData "bold">
898
+ # Below are the Unicode character category abbreviations and names. Enumerations
899
+ # of characters in each category are at the links.
571
900
  #
572
- # ## Options
901
+ # Letters:
573
902
  #
574
- # The end delimiter for a regexp can be followed by one or more single-letter
575
- # options which control how the pattern can match.
903
+ # * `L`, `Letter`: `LC`, `Lm`, or `Lo`.
904
+ # * `LC`, `Cased_Letter`: `Ll`, `Lt`, or `Lu`.
905
+ # * [Ll, Lowercase_Letter](https://www.compart.com/en/unicode/category/Ll).
906
+ # * [Lm, Modifier_Letter](https://www.compart.com/en/unicode/category/Lm).
907
+ # * [Lo, Other_Letter](https://www.compart.com/en/unicode/category/Lo).
908
+ # * [Lt, Titlecase_Letter](https://www.compart.com/en/unicode/category/Lt).
909
+ # * [Lu, Uppercase_Letter](https://www.compart.com/en/unicode/category/Lu).
576
910
  #
577
- # * `/pat/i` - Ignore case
578
- # * `/pat/m` - Treat a newline as a character matched by `.`
579
- # * `/pat/x` - Ignore whitespace and comments in the pattern
580
- # * `/pat/o` - Perform `#{}` interpolation only once
581
911
  #
912
+ # Marks:
582
913
  #
583
- # `i`, `m`, and `x` can also be applied on the subexpression level with the
584
- # `(?`*on*`-`*off*`)` construct, which enables options *on*, and disables
585
- # options *off* for the expression enclosed by the parentheses:
914
+ # * `M`, `Mark`: `Mc`, `Me`, or `Mn`.
915
+ # * [Mc, Spacing_Mark](https://www.compart.com/en/unicode/category/Mc).
916
+ # * [Me, Enclosing_Mark](https://www.compart.com/en/unicode/category/Me).
917
+ # * [Mn, Nonspacing_Mark](https://www.compart.com/en/unicode/category/Mn).
586
918
  #
587
- # /a(?i:b)c/.match('aBc') #=> #<MatchData "aBc">
588
- # /a(?-i:b)c/i.match('ABC') #=> nil
589
919
  #
590
- # Additionally, these options can also be toggled for the remainder of the
591
- # pattern:
920
+ # Numbers:
592
921
  #
593
- # /a(?i)bc/.match('abC') #=> #<MatchData "abC">
922
+ # * `N`, `Number`: `Nd`, `Nl`, or `No`.
923
+ # * [Nd, Decimal_Number](https://www.compart.com/en/unicode/category/Nd).
924
+ # * [Nl, Letter_Number](https://www.compart.com/en/unicode/category/Nl).
925
+ # * [No, Other_Number](https://www.compart.com/en/unicode/category/No).
594
926
  #
595
- # Options may also be used with `Regexp.new`:
596
927
  #
597
- # Regexp.new("abc", Regexp::IGNORECASE) #=> /abc/i
598
- # Regexp.new("abc", Regexp::MULTILINE) #=> /abc/m
599
- # Regexp.new("abc # Comment", Regexp::EXTENDED) #=> /abc # Comment/x
600
- # Regexp.new("abc", Regexp::IGNORECASE | Regexp::MULTILINE) #=> /abc/mi
928
+ # Punctuation:
601
929
  #
602
- # Regexp.new("abc", "i") #=> /abc/i
603
- # Regexp.new("abc", "m") #=> /abc/m
604
- # Regexp.new("abc # Comment", "x") #=> /abc # Comment/x
605
- # Regexp.new("abc", "im") #=> /abc/mi
930
+ # * `P`, `Punctuation`: `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`.
931
+ # * [Pc,
932
+ # Connector_Punctuation](https://www.compart.com/en/unicode/category/Pc).
933
+ # * [Pd, Dash_Punctuation](https://www.compart.com/en/unicode/category/Pd).
934
+ # * [Pe, Close_Punctuation](https://www.compart.com/en/unicode/category/Pe).
935
+ # * [Pf, Final_Punctuation](https://www.compart.com/en/unicode/category/Pf).
936
+ # * [Pi, Initial_Punctuation](https://www.compart.com/en/unicode/category/Pi).
937
+ # * [Po, Other_Punctuation](https://www.compart.com/en/unicode/category/Po).
938
+ # * [Ps, Open_Punctuation](https://www.compart.com/en/unicode/category/Ps).
606
939
  #
607
- # ## Free-Spacing Mode and Comments
940
+ # * `S`, `Symbol`: `Sc`, `Sk`, `Sm`, or `So`.
941
+ # * [Sc, Currency_Symbol](https://www.compart.com/en/unicode/category/Sc).
942
+ # * [Sk, Modifier_Symbol](https://www.compart.com/en/unicode/category/Sk).
943
+ # * [Sm, Math_Symbol](https://www.compart.com/en/unicode/category/Sm).
944
+ # * [So, Other_Symbol](https://www.compart.com/en/unicode/category/So).
608
945
  #
609
- # As mentioned above, the `x` option enables *free-spacing* mode. Literal white
610
- # space inside the pattern is ignored, and the octothorpe (`#`) character
611
- # introduces a comment until the end of the line. This allows the components of
612
- # the pattern to be organized in a potentially more readable fashion.
946
+ # * `Z`, `Separator`: `Zl`, `Zp`, or `Zs`.
947
+ # * [Zl, Line_Separator](https://www.compart.com/en/unicode/category/Zl).
948
+ # * [Zp, Paragraph_Separator](https://www.compart.com/en/unicode/category/Zp).
949
+ # * [Zs, Space_Separator](https://www.compart.com/en/unicode/category/Zs).
613
950
  #
614
- # A contrived pattern to match a number with optional decimal places:
951
+ # * `C`, `Other`: `Cc`, `Cf`, `Cn`, `Co`, or `Cs`.
952
+ # * [Cc, Control](https://www.compart.com/en/unicode/category/Cc).
953
+ # * [Cf, Format](https://www.compart.com/en/unicode/category/Cf).
954
+ # * [Cn, Unassigned](https://www.compart.com/en/unicode/category/Cn).
955
+ # * [Co, Private_Use](https://www.compart.com/en/unicode/category/Co).
956
+ # * [Cs, Surrogate](https://www.compart.com/en/unicode/category/Cs).
615
957
  #
616
- # float_pat = /\A
617
- # [[:digit:]]+ # 1 or more digits before the decimal point
618
- # (\. # Decimal point
619
- # [[:digit:]]+ # 1 or more digits after the decimal point
620
- # )? # The decimal point and following digits are optional
621
- # \Z/x
622
- # float_pat.match('3.14') #=> #<MatchData "3.14" 1:".14">
623
958
  #
624
- # There are a number of strategies for matching whitespace:
959
+ # #### Unicode Scripts and Blocks
625
960
  #
626
- # * Use a pattern such as `\s` or `\p{Space}`.
627
- # * Use escaped whitespace such as `\ `, i.e. a space preceded by a backslash.
628
- # * Use a character class such as `[ ]`.
961
+ # Among the Unicode properties are:
629
962
  #
963
+ # * [Unicode scripts](https://en.wikipedia.org/wiki/Script_(Unicode)); see
964
+ # [supported scripts](https://www.unicode.org/standard/supported.html).
965
+ # * [Unicode blocks](https://en.wikipedia.org/wiki/Unicode_block); see
966
+ # [supported blocks](http://www.unicode.org/Public/UNIDATA/Blocks.txt).
630
967
  #
631
- # Comments can be included in a non-`x` pattern with the `(?#`*comment*`)`
632
- # construct, where *comment* is arbitrary text ignored by the regexp engine.
633
968
  #
634
- # Comments in regexp literals cannot include unescaped terminator characters.
969
+ # ### POSIX Bracket Expressions
635
970
  #
636
- # ## Encoding
971
+ # A POSIX *bracket expression* is also similar to a character class. These
972
+ # expressions provide a portable alternative to the above, with the added
973
+ # benefit of encompassing non-ASCII characters:
637
974
  #
638
- # Regular expressions are assumed to use the source encoding. This can be
639
- # overridden with one of the following modifiers.
975
+ # * `/\d/` matches only ASCII decimal digits `0` through `9`.
976
+ # * `/[[:digit:]]/` matches any character in the Unicode `Decimal Number`
977
+ # (`Nd`) category; see below.
640
978
  #
641
- # * `/`*pat*`/u` - UTF-8
642
- # * `/`*pat*`/e` - EUC-JP
643
- # * `/`*pat*`/s` - Windows-31J
644
- # * `/`*pat*`/n` - ASCII-8BIT
645
979
  #
980
+ # The POSIX bracket expressions:
646
981
  #
647
- # A regexp can be matched against a string when they either share an encoding,
648
- # or the regexp's encoding is *US-ASCII* and the string's encoding is
649
- # ASCII-compatible.
982
+ # * `/[[:digit:]]/`: Matches a [Unicode
983
+ # digit](https://www.compart.com/en/unicode/category/Nd):
650
984
  #
651
- # If a match between incompatible encodings is attempted an
652
- # `Encoding::CompatibilityError` exception is raised.
985
+ # /[[:digit:]]/.match('9') # => #<MatchData "9">
986
+ # /[[:digit:]]/.match("\u1fbf9") # => #<MatchData "9">
987
+ #
988
+ # * `/[[:xdigit:]]/`: Matches a digit allowed in a hexadecimal number;
989
+ # equivalent to `[0-9a-fA-F]`.
990
+ #
991
+ # * `/[[:upper:]]/`: Matches a [Unicode uppercase
992
+ # letter](https://www.compart.com/en/unicode/category/Lu):
993
+ #
994
+ # /[[:upper:]]/.match('A') # => #<MatchData "A">
995
+ # /[[:upper:]]/.match("\u00c6") # => #<MatchData "Æ">
996
+ #
997
+ # * `/[[:lower:]]/`: Matches a [Unicode lowercase
998
+ # letter](https://www.compart.com/en/unicode/category/Ll):
999
+ #
1000
+ # /[[:lower:]]/.match('a') # => #<MatchData "a">
1001
+ # /[[:lower:]]/.match("\u01fd") # => #<MatchData "ǽ">
1002
+ #
1003
+ # * `/[[:alpha:]]/`: Matches `/[[:upper:]]/` or `/[[:lower:]]/`.
1004
+ #
1005
+ # * `/[[:alnum:]]/`: Matches `/[[:alpha:]]/` or `/[[:digit:]]/`.
1006
+ #
1007
+ # * `/[[:space:]]/`: Matches [Unicode space
1008
+ # character](https://www.compart.com/en/unicode/category/Zs):
1009
+ #
1010
+ # /[[:space:]]/.match(' ') # => #<MatchData " ">
1011
+ # /[[:space:]]/.match("\u2005") # => #<MatchData " ">
1012
+ #
1013
+ # * `/[[:blank:]]/`: Matches `/[[:space:]]/` or tab character:
1014
+ #
1015
+ # /[[:blank:]]/.match(' ') # => #<MatchData " ">
1016
+ # /[[:blank:]]/.match("\u2005") # => #<MatchData " ">
1017
+ # /[[:blank:]]/.match("\t") # => #<MatchData "\t">
1018
+ #
1019
+ # * `/[[:cntrl:]]/`: Matches [Unicode control
1020
+ # character](https://www.compart.com/en/unicode/category/Cc):
1021
+ #
1022
+ # /[[:cntrl:]]/.match("\u0000") # => #<MatchData "\u0000">
1023
+ # /[[:cntrl:]]/.match("\u009f") # => #<MatchData "\u009F">
1024
+ #
1025
+ # * `/[[:graph:]]/`: Matches any character except `/[[:space:]]/` or
1026
+ # `/[[:cntrl:]]/`.
1027
+ #
1028
+ # * `/[[:print:]]/`: Matches `/[[:graph:]]/` or space character.
1029
+ #
1030
+ # * `/[[:punct:]]/`: Matches any [Unicode punctuation
1031
+ # character](https://www.compart.com/en/unicode/category/Po):
1032
+ #
1033
+ #
1034
+ # Ruby also supports these (non-POSIX) bracket expressions:
1035
+ #
1036
+ # * `/[[:ascii:]]/`: Matches a character in the ASCII character set.
1037
+ # * `/[[:word:]]/`: Matches a character in one of these Unicode character
1038
+ # categories or having one of these Unicode properties:
1039
+ #
1040
+ # * Unicode categories:
1041
+ # * `Mark` (`M`).
1042
+ # * `Decimal Number` (`Nd`)
1043
+ # * `Connector Punctuation` (`Pc`).
1044
+ #
1045
+ #
1046
+ # * Unicode properties:
1047
+ # * `Alpha`
1048
+ # * `Join_Control`
653
1049
  #
654
- # The `Regexp#fixed_encoding?` predicate indicates whether the regexp has a
655
- # *fixed* encoding, that is one incompatible with ASCII. A regexp's encoding can
656
- # be explicitly fixed by supplying `Regexp::FIXEDENCODING` as the second
657
- # argument of `Regexp.new`:
658
1050
  #
659
- # r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING)
660
- # r =~ "a\u3042"
661
- # # raises Encoding::CompatibilityError: incompatible encoding regexp match
662
- # # (ISO-8859-1 regexp with UTF-8 string)
663
1051
  #
664
- # ## Regexp Global Variables
665
1052
  #
666
- # Pattern matching sets some global variables :
1053
+ # ### Comments
667
1054
  #
668
- # * `$~` is equivalent to Regexp.last_match;
669
- # * `$&` contains the complete matched text;
670
- # * `$`` contains string before match;
671
- # * `$'` contains string after match;
672
- # * `$1`, `$2` and so on contain text matching first, second, etc capture
673
- # group;
674
- # * `$+` contains last capture group.
1055
+ # A comment may be included in a regexp pattern using the `(?#`*comment*`)`
1056
+ # construct, where *comment* is a substring that is to be ignored by the
1057
+ # regexp engine:
675
1058
  #
1059
+ # /foo(?#Ignore me)bar/.match('foobar') # => #<MatchData "foobar">
1060
+ #
1061
+ # The comment may not include an unescaped terminator character.
1062
+ #
1063
+ # See also [Extended Mode](rdoc-ref:Regexp@Extended+Mode).
1064
+ #
1065
+ # ## Modes
1066
+ #
1067
+ # Each of these modifiers sets a mode for the regexp:
1068
+ #
1069
+ # * `i`: `/*pattern*/i` sets [Case-Insensitive
1070
+ # Mode](rdoc-ref:Regexp@Case-Insensitive+Mode).
1071
+ # * `m`: `/*pattern*/m` sets [Multiline Mode](rdoc-ref:Regexp@Multiline+Mode).
1072
+ # * `x`: `/*pattern*/x` sets [Extended Mode](rdoc-ref:Regexp@Extended+Mode).
1073
+ # * `o`: `/*pattern*/o` sets [Interpolation
1074
+ # Mode](rdoc-ref:Regexp@Interpolation+Mode).
1075
+ #
1076
+ #
1077
+ # Any, all, or none of these may be applied.
1078
+ #
1079
+ # Modifiers `i`, `m`, and `x` may be applied to subexpressions:
1080
+ #
1081
+ # * `(?*modifier*)` turns the mode "on" for ensuing subexpressions
1082
+ # * `(?-*modifier*)` turns the mode "off" for ensuing subexpressions
1083
+ # * `(?*modifier*:*subexp*)` turns the mode "on" for *subexp* within the group
1084
+ # * `(?-*modifier*:*subexp*)` turns the mode "off" for *subexp* within the
1085
+ # group
1086
+ #
1087
+ #
1088
+ # Example:
1089
+ #
1090
+ # re = /(?i)te(?-i)st/
1091
+ # re.match('test') # => #<MatchData "test">
1092
+ # re.match('TEst') # => #<MatchData "TEst">
1093
+ # re.match('TEST') # => nil
1094
+ # re.match('teST') # => nil
1095
+ #
1096
+ # re = /t(?i:e)st/
1097
+ # re.match('test') # => #<MatchData "test">
1098
+ # re.match('tEst') # => #<MatchData "tEst">
1099
+ # re.match('tEST') # => nil
1100
+ #
1101
+ # Method Regexp#options returns an integer whose value shows the settings for
1102
+ # case-insensitivity mode, multiline mode, and extended mode.
1103
+ #
1104
+ # ### Case-Insensitive Mode
1105
+ #
1106
+ # By default, a regexp is case-sensitive:
1107
+ #
1108
+ # /foo/.match('FOO') # => nil
1109
+ #
1110
+ # Modifier `i` enables case-insensitive mode:
1111
+ #
1112
+ # /foo/i.match('FOO')
1113
+ # # => #<MatchData "FOO">
1114
+ #
1115
+ # Method Regexp#casefold? returns whether the mode is case-insensitive.
1116
+ #
1117
+ # ### Multiline Mode
1118
+ #
1119
+ # The multiline-mode in Ruby is what is commonly called a "dot-all mode":
1120
+ #
1121
+ # * Without the `m` modifier, the subexpression `.` does not match newlines:
1122
+ #
1123
+ # /a.c/.match("a\nc") # => nil
1124
+ #
1125
+ # * With the modifier, it does match:
1126
+ #
1127
+ # /a.c/m.match("a\nc") # => #<MatchData "a\nc">
1128
+ #
1129
+ #
1130
+ # Unlike other languages, the modifier `m` does not affect the anchors `^` and
1131
+ # `$`. These anchors always match at line-boundaries in Ruby.
1132
+ #
1133
+ # ### Extended Mode
1134
+ #
1135
+ # Modifier `x` enables extended mode, which means that:
1136
+ #
1137
+ # * Literal white space in the pattern is to be ignored.
1138
+ # * Character `#` marks the remainder of its containing line as a comment,
1139
+ # which is also to be ignored for matching purposes.
1140
+ #
1141
+ #
1142
+ # In extended mode, whitespace and comments may be used to form a
1143
+ # self-documented regexp.
1144
+ #
1145
+ # Regexp not in extended mode (matches some Roman numerals):
1146
+ #
1147
+ # pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
1148
+ # re = /#{pattern}/
1149
+ # re.match('MCMXLIII') # => #<MatchData "MCMXLIII" 1:"CM" 2:"XL" 3:"III">
1150
+ #
1151
+ # Regexp in extended mode:
1152
+ #
1153
+ # pattern = <<-EOT
1154
+ # ^ # beginning of string
1155
+ # M{0,3} # thousands - 0 to 3 Ms
1156
+ # (CM|CD|D?C{0,3}) # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 Cs),
1157
+ # # or 500-800 (D, followed by 0 to 3 Cs)
1158
+ # (XC|XL|L?X{0,3}) # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 Xs),
1159
+ # # or 50-80 (L, followed by 0 to 3 Xs)
1160
+ # (IX|IV|V?I{0,3}) # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 Is),
1161
+ # # or 5-8 (V, followed by 0 to 3 Is)
1162
+ # $ # end of string
1163
+ # EOT
1164
+ # re = /#{pattern}/x
1165
+ # re.match('MCMXLIII') # => #<MatchData "MCMXLIII" 1:"CM" 2:"XL" 3:"III">
1166
+ #
1167
+ # ### Interpolation Mode
1168
+ #
1169
+ # Modifier `o` means that the first time a literal regexp with interpolations is
1170
+ # encountered, the generated Regexp object is saved and used for all future
1171
+ # evaluations of that literal regexp. Without modifier `o`, the generated Regexp
1172
+ # is not saved, so each evaluation of the literal regexp generates a new Regexp
1173
+ # object.
1174
+ #
1175
+ # Without modifier `o`:
1176
+ #
1177
+ # def letters; sleep 5; /[A-Z][a-z]/; end
1178
+ # words = %w[abc def xyz]
1179
+ # start = Time.now
1180
+ # words.each {|word| word.match(/\A[#{letters}]+\z/) }
1181
+ # Time.now - start # => 15.0174892
1182
+ #
1183
+ # With modifier `o`:
1184
+ #
1185
+ # start = Time.now
1186
+ # words.each {|word| word.match(/\A[#{letters}]+\z/o) }
1187
+ # Time.now - start # => 5.0010866
1188
+ #
1189
+ # Note that if the literal regexp does not have interpolations, the `o` behavior
1190
+ # is the default.
1191
+ #
1192
+ # ## Encodings
1193
+ #
1194
+ # By default, a regexp with only US-ASCII characters has US-ASCII encoding:
1195
+ #
1196
+ # re = /foo/
1197
+ # re.source.encoding # => #<Encoding:US-ASCII>
1198
+ # re.encoding # => #<Encoding:US-ASCII>
1199
+ #
1200
+ # A regular expression containing non-US-ASCII characters is assumed to use the
1201
+ # source encoding. This can be overridden with one of the following modifiers.
1202
+ #
1203
+ # * `/*pat*/n`: US-ASCII if only containing US-ASCII characters, otherwise
1204
+ # ASCII-8BIT:
1205
+ #
1206
+ # /foo/n.encoding # => #<Encoding:US-ASCII>
1207
+ # /foo\xff/n.encoding # => #<Encoding:ASCII-8BIT>
1208
+ # /foo\x7f/n.encoding # => #<Encoding:US-ASCII>
1209
+ #
1210
+ # * `/*pat*/u`: UTF-8
1211
+ #
1212
+ # /foo/u.encoding # => #<Encoding:UTF-8>
1213
+ #
1214
+ # * `/*pat*/e`: EUC-JP
1215
+ #
1216
+ # /foo/e.encoding # => #<Encoding:EUC-JP>
1217
+ #
1218
+ # * `/*pat*/s`: Windows-31J
1219
+ #
1220
+ # /foo/s.encoding # => #<Encoding:Windows-31J>
1221
+ #
1222
+ #
1223
+ # A regexp can be matched against a target string when either:
1224
+ #
1225
+ # * They have the same encoding.
1226
+ # * The regexp's encoding is a fixed encoding and the string contains only
1227
+ # ASCII characters. Method Regexp#fixed_encoding? returns whether the regexp
1228
+ # has a *fixed* encoding.
1229
+ #
1230
+ #
1231
+ # If a match between incompatible encodings is attempted an
1232
+ # `Encoding::CompatibilityError` exception is raised.
676
1233
  #
677
1234
  # Example:
678
1235
  #
679
- # m = /s(\w{2}).*(c)/.match('haystack') #=> #<MatchData "stac" 1:"ta" 2:"c">
680
- # $~ #=> #<MatchData "stac" 1:"ta" 2:"c">
681
- # Regexp.last_match #=> #<MatchData "stac" 1:"ta" 2:"c">
1236
+ # re = eval("# encoding: ISO-8859-1\n/foo\\xff?/")
1237
+ # re.encoding # => #<Encoding:ISO-8859-1>
1238
+ # re =~ "foo".encode("UTF-8") # => 0
1239
+ # re =~ "foo\u0100" # Raises Encoding::CompatibilityError
1240
+ #
1241
+ # The encoding may be explicitly fixed by including Regexp::FIXEDENCODING in the
1242
+ # second argument for Regexp.new:
1243
+ #
1244
+ # # Regexp with encoding ISO-8859-1.
1245
+ # re = Regexp.new("a".force_encoding('iso-8859-1'), Regexp::FIXEDENCODING)
1246
+ # re.encoding # => #<Encoding:ISO-8859-1>
1247
+ # # Target string with encoding UTF-8.
1248
+ # s = "a\u3042"
1249
+ # s.encoding # => #<Encoding:UTF-8>
1250
+ # re.match(s) # Raises Encoding::CompatibilityError.
682
1251
  #
683
- # $& #=> "stac"
684
- # # same as m[0]
685
- # $` #=> "hay"
686
- # # same as m.pre_match
687
- # $' #=> "k"
688
- # # same as m.post_match
689
- # $1 #=> "ta"
690
- # # same as m[1]
691
- # $2 #=> "c"
692
- # # same as m[2]
693
- # $3 #=> nil
694
- # # no third group in pattern
695
- # $+ #=> "c"
696
- # # same as m[-1]
1252
+ # ## Timeouts
697
1253
  #
698
- # These global variables are thread-local and method-local variables.
1254
+ # When either a regexp source or a target string comes from untrusted input,
1255
+ # malicious values could become a denial-of-service attack; to prevent such an
1256
+ # attack, it is wise to set a timeout.
699
1257
  #
700
- # ## Performance
1258
+ # Regexp has two timeout values:
701
1259
  #
702
- # Certain pathological combinations of constructs can lead to abysmally bad
703
- # performance.
1260
+ # * A class default timeout, used for a regexp whose instance timeout is
1261
+ # `nil`; this default is initially `nil`, and may be set by method
1262
+ # Regexp.timeout=:
704
1263
  #
705
- # Consider a string of 25 *a*s, a *d*, 4 *a*s, and a *c*.
1264
+ # Regexp.timeout # => nil
1265
+ # Regexp.timeout = 3.0
1266
+ # Regexp.timeout # => 3.0
706
1267
  #
707
- # s = 'a' * 25 + 'd' + 'a' * 4 + 'c'
708
- # #=> "aaaaaaaaaaaaaaaaaaaaaaaaadaaaac"
1268
+ # * An instance timeout, which defaults to `nil` and may be set in Regexp.new:
709
1269
  #
710
- # The following patterns match instantly as you would expect:
1270
+ # re = Regexp.new('foo', timeout: 5.0)
1271
+ # re.timeout # => 5.0
711
1272
  #
712
- # /(b|a)/ =~ s #=> 0
713
- # /(b|a+)/ =~ s #=> 0
714
- # /(b|a+)*/ =~ s #=> 0
715
1273
  #
716
- # However, the following pattern takes appreciably longer:
1274
+ # When regexp.timeout is `nil`, the timeout "falls through" to Regexp.timeout;
1275
+ # when regexp.timeout is non-`nil`, that value controls timing out:
717
1276
  #
718
- # /(b|a+)*c/ =~ s #=> 26
1277
+ # | regexp.timeout Value | Regexp.timeout Value | Result |
1278
+ # |----------------------|----------------------|-----------------------------|
1279
+ # | nil | nil | Never times out. |
1280
+ # | nil | Float | Times out in Float seconds. |
1281
+ # | Float | Any | Times out in Float seconds. |
719
1282
  #
720
- # This happens because an atom in the regexp is quantified by both an immediate
721
- # `+` and an enclosing `*` with nothing to differentiate which is in control of
722
- # any particular character. The nondeterminism that results produces
723
- # super-linear performance. (Consult *Mastering Regular Expressions* (3rd ed.),
724
- # pp 222, by *Jeffery Friedl*, for an in-depth analysis). This particular case
725
- # can be fixed by use of atomic grouping, which prevents the unnecessary
726
- # backtracking:
1283
+ # ## Optimization
727
1284
  #
728
- # (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start)
729
- # #=> 24.702736882
730
- # (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start)
731
- # #=> 0.000166571
1285
+ # For certain values of the pattern and target string, matching time can grow
1286
+ # polynomially or exponentially in relation to the input size; the potential
1287
+ # vulnerability arising from this is the [regular expression
1288
+ # denial-of-service](https://en.wikipedia.org/wiki/ReDoS) (ReDoS) attack.
732
1289
  #
733
- # A similar case is typified by the following example, which takes approximately
734
- # 60 seconds to execute for me:
1290
+ # Regexp matching can apply an optimization to prevent ReDoS attacks. When the
1291
+ # optimization is applied, matching time increases linearly (not polynomially or
1292
+ # exponentially) in relation to the input size, and a ReDoS attack is not
1293
+ # possible.
735
1294
  #
736
- # Match a string of 29 *a*s against a pattern of 29 optional *a*s followed by 29
737
- # mandatory *a*s:
1295
+ # This optimization is applied if the pattern meets these criteria:
738
1296
  #
739
- # Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29
1297
+ # * No backreferences.
1298
+ # * No subexpression calls.
1299
+ # * No nested lookaround anchors or atomic groups.
1300
+ # * No nested quantifiers with counting (i.e. no nested `{n}`, `{min,}`,
1301
+ # `{,max}`, or `{min,max}` style quantifiers)
740
1302
  #
741
- # The 29 optional *a*s match the string, but this prevents the 29 mandatory *a*s
742
- # that follow from matching. Ruby must then backtrack repeatedly so as to
743
- # satisfy as many of the optional matches as it can while still matching the
744
- # mandatory 29. It is plain to us that none of the optional matches can succeed,
745
- # but this fact unfortunately eludes Ruby.
746
1303
  #
747
- # The best way to improve performance is to significantly reduce the amount of
748
- # backtracking needed. For this case, instead of individually matching 29
749
- # optional *a*s, a range of optional *a*s can be matched all at once with
750
- # *a{0,29}*:
1304
+ # You can use method Regexp.linear_time? to determine whether a pattern meets
1305
+ # these criteria:
751
1306
  #
752
- # Regexp.new('a{0,29}' + 'a' * 29) =~ 'a' * 29
1307
+ # Regexp.linear_time?(/a*/) # => true
1308
+ # Regexp.linear_time?('a*') # => true
1309
+ # Regexp.linear_time?(/(a*)\1/) # => false
753
1310
  #
754
- # ## Timeout
1311
+ # However, an untrusted source may not be safe even if the method returns
1312
+ # `true`, because the optimization uses memoization (which may invoke large
1313
+ # memory consumption).
755
1314
  #
756
- # There are two APIs to set timeout. One is Regexp.timeout=, which is
757
- # process-global configuration of timeout for Regexp matching.
1315
+ # ## References
758
1316
  #
759
- # Regexp.timeout = 3
760
- # s = 'a' * 25 + 'd' + 'a' * 4 + 'c'
761
- # /(b|a+)*c/ =~ s #=> This raises an exception in three seconds
1317
+ # Read (online PDF books):
762
1318
  #
763
- # The other is timeout keyword of Regexp.new.
1319
+ # * [Mastering Regular
1320
+ # Expressions](https://ia902508.us.archive.org/10/items/allitebooks-02/Maste
1321
+ # ring%20Regular%20Expressions%2C%203rd%20Edition.pdf) by Jeffrey E.F.
1322
+ # Friedl.
1323
+ # * [Regular Expressions
1324
+ # Cookbook](https://doc.lagout.org/programmation/Regular%20Expressions/Regul
1325
+ # ar%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Program
1326
+ # ming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202012-
1327
+ # 09-06%5D.pdf) by Jan Goyvaerts & Steven Levithan.
764
1328
  #
765
- # re = Regexp.new("(b|a+)*c", timeout: 3)
766
- # s = 'a' * 25 + 'd' + 'a' * 4 + 'c'
767
- # /(b|a+)*c/ =~ s #=> This raises an exception in three seconds
768
1329
  #
769
- # When using Regexps to process untrusted input, you should use the timeout
770
- # feature to avoid excessive backtracking. Otherwise, a malicious user can
771
- # provide input to Regexp causing Denial-of-Service attack. Note that the
772
- # timeout is not set by default because an appropriate limit highly depends on
773
- # an application requirement and context.
1330
+ # Explore, test (interactive online editor):
1331
+ #
1332
+ # * [Rubular](https://rubular.com/).
774
1333
  #
775
1334
  class Regexp
776
1335
  # <!--
@@ -792,7 +1351,7 @@ class Regexp
792
1351
  # Regexp.new('foo', 'i') # => /foo/i
793
1352
  # Regexp.new('foo', 'im') # => /foo/im
794
1353
  #
795
- # * The logical OR of one or more of the constants Regexp::EXTENDED,
1354
+ # * The bit-wise OR of one or more of the constants Regexp::EXTENDED,
796
1355
  # Regexp::IGNORECASE, Regexp::MULTILINE, and Regexp::NOENCODING:
797
1356
  #
798
1357
  # Regexp.new('foo', Regexp::IGNORECASE) # => /foo/i
@@ -803,6 +1362,7 @@ class Regexp
803
1362
  # Regexp.new('foo', flags) # => /foo/mix
804
1363
  #
805
1364
  # * `nil` or `false`, which is ignored.
1365
+ # * Any other truthy value, in which case the regexp will be case-insensitive.
806
1366
  #
807
1367
  #
808
1368
  # If optional keyword argument `timeout` is given, its float value overrides the
@@ -820,8 +1380,6 @@ class Regexp
820
1380
  # r3 = Regexp.new(r, timeout: 3.14) # => /foo/m
821
1381
  # r3.timeout # => 3.14
822
1382
  #
823
- # Regexp.compile is an alias for Regexp.new.
824
- #
825
1383
  def initialize: (String string, ?String | Integer | nil | false options, ?timeout: Float?) -> Object
826
1384
  | (Regexp regexp, ?timeout: Float?) -> void
827
1385
 
@@ -847,8 +1405,6 @@ class Regexp
847
1405
  # r = Regexp.new(Regexp.escape(s)) # => /\\\\\\\*\\\?\\\{\\\}\\\./
848
1406
  # r.match(s) # => #<MatchData "\\\\\\*\\?\\{\\}\\.">
849
1407
  #
850
- # Regexp.quote is an alias for Regexp.escape.
851
- #
852
1408
  def self.escape: (interned str) -> String
853
1409
 
854
1410
  # <!--
@@ -858,8 +1414,8 @@ class Regexp
858
1414
  # - Regexp.last_match(name) -> string or nil
859
1415
  # -->
860
1416
  # With no argument, returns the value of `$~`, which is the result of the most
861
- # recent pattern match (see [Regexp Global
862
- # Variables](rdoc-ref:Regexp@Regexp+Global+Variables)):
1417
+ # recent pattern match (see [Regexp global
1418
+ # variables](rdoc-ref:Regexp@Global+Variables)):
863
1419
  #
864
1420
  # /c(.)t/ =~ 'cat' # => 0
865
1421
  # Regexp.last_match # => #<MatchData "cat" 1:"a">
@@ -926,8 +1482,6 @@ class Regexp
926
1482
  # r = Regexp.new(Regexp.escape(s)) # => /\\\\\\\*\\\?\\\{\\\}\\\./
927
1483
  # r.match(s) # => #<MatchData "\\\\\\*\\?\\{\\}\\.">
928
1484
  #
929
- # Regexp.quote is an alias for Regexp.escape.
930
- #
931
1485
  def self.quote: (interned str) -> String
932
1486
 
933
1487
  # <!--
@@ -1019,8 +1573,6 @@ class Regexp
1019
1573
  # /foo/ == Regexp.new('food') # => false
1020
1574
  # /foo/ == Regexp.new("abc".force_encoding("euc-jp")) # => false
1021
1575
  #
1022
- # Regexp#eql? is an alias for Regexp#==.
1023
- #
1024
1576
  def ==: (untyped other) -> bool
1025
1577
 
1026
1578
  # <!--
@@ -1048,8 +1600,8 @@ class Regexp
1048
1600
  # - regexp =~ string -> integer or nil
1049
1601
  # -->
1050
1602
  # Returns the integer index (in characters) of the first match for `self` and
1051
- # `string`, or `nil` if none; also sets the [rdoc-ref:Regexp Global
1052
- # Variables](rdoc-ref:Regexp@Regexp+Global+Variables):
1603
+ # `string`, or `nil` if none; also sets the [Regexp global
1604
+ # variables](rdoc-ref:Regexp@Global+Variables):
1053
1605
  #
1054
1606
  # /at/ =~ 'input data' # => 7
1055
1607
  # $~ # => #<MatchData "at">
@@ -1062,7 +1614,7 @@ class Regexp
1062
1614
  # * Is a regexp literal; see [Regexp
1063
1615
  # Literals](rdoc-ref:literals.rdoc@Regexp+Literals).
1064
1616
  # * Does not contain interpolations; see [Regexp
1065
- # Interpolation](rdoc-ref:Regexp@Regexp+Interpolation).
1617
+ # interpolation](rdoc-ref:Regexp@Interpolation+Mode).
1066
1618
  # * Is at the left of the expression.
1067
1619
  #
1068
1620
  #
@@ -1131,8 +1683,6 @@ class Regexp
1131
1683
  # /foo/ == Regexp.new('food') # => false
1132
1684
  # /foo/ == Regexp.new("abc".force_encoding("euc-jp")) # => false
1133
1685
  #
1134
- # Regexp#eql? is an alias for Regexp#==.
1135
- #
1136
1686
  def eql?: (untyped other) -> bool
1137
1687
 
1138
1688
  # <!--
@@ -1296,8 +1846,8 @@ class Regexp
1296
1846
  # /foo/mix.options # => 7
1297
1847
  #
1298
1848
  # Note that additional bits may be set in the returned integer; these are
1299
- # maintained internally internally in `self`, are ignored if passed to
1300
- # Regexp.new, and may be ignored by the caller:
1849
+ # maintained internally in `self`, are ignored if passed to Regexp.new, and may
1850
+ # be ignored by the caller:
1301
1851
  #
1302
1852
  # Returns the set of bits corresponding to the options used when creating this
1303
1853
  # regexp (see Regexp::new for details). Note that additional bits may be set in
@@ -1339,7 +1889,8 @@ class Regexp
1339
1889
  # s0 = r0.to_s # => "(?ix-m:ab+c)"
1340
1890
  #
1341
1891
  # The returned string may be used as an argument to Regexp.new, or as
1342
- # interpolated text for a [Regexp literal](rdoc-ref:regexp.rdoc@Regexp+Literal):
1892
+ # interpolated text for a [Regexp
1893
+ # interpolation](rdoc-ref:Regexp@Interpolation+Mode):
1343
1894
  #
1344
1895
  # r1 = Regexp.new(s0) # => /(?ix-m:ab+c)/
1345
1896
  # r2 = /#{s0}/ # => /(?ix-m:ab+c)/