rbs 3.3.2 → 3.4.0.pre.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/comments.yml +2 -5
- data/.github/workflows/ruby.yml +7 -8
- data/.github/workflows/typecheck.yml +37 -0
- data/CHANGELOG.md +50 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +11 -11
- data/Steepfile +2 -2
- data/core/array.rbs +19 -49
- data/core/basic_object.rbs +2 -2
- data/core/comparable.rbs +17 -8
- data/core/complex.rbs +82 -43
- data/core/data.rbs +2 -4
- data/core/dir.rbs +635 -295
- data/core/enumerable.rbs +11 -18
- data/core/enumerator.rbs +37 -31
- data/core/errors.rbs +4 -0
- data/core/false_class.rbs +34 -15
- data/core/fiber.rbs +23 -0
- data/core/file.rbs +329 -120
- data/core/float.rbs +17 -32
- data/core/gc.rbs +17 -11
- data/core/hash.rbs +22 -44
- data/core/integer.rbs +82 -113
- data/core/io/buffer.rbs +90 -47
- data/core/io.rbs +39 -116
- data/core/kernel.rbs +442 -489
- data/core/match_data.rbs +55 -56
- data/core/module.rbs +45 -1
- data/core/nil_class.rbs +98 -35
- data/core/numeric.rbs +22 -32
- data/core/object_space/weak_key_map.rbs +102 -0
- data/core/process.rbs +1242 -655
- data/core/ractor.rbs +139 -120
- data/core/range.rbs +100 -4
- data/core/rational.rbs +0 -4
- data/core/rbs/unnamed/argf.rbs +16 -8
- data/core/rbs/unnamed/env_class.rbs +0 -24
- data/core/refinement.rbs +8 -0
- data/core/regexp.rbs +1149 -598
- data/core/ruby_vm.rbs +126 -12
- data/core/rubygems/platform.rbs +9 -0
- data/core/rubygems/rubygems.rbs +1 -1
- data/core/rubygems/version.rbs +5 -1
- data/core/set.rbs +20 -22
- data/core/signal.rbs +4 -4
- data/core/string.rbs +283 -230
- data/core/string_io.rbs +2 -14
- data/core/struct.rbs +404 -24
- data/core/symbol.rbs +1 -19
- data/core/thread.rbs +29 -12
- data/core/time.rbs +227 -104
- data/core/trace_point.rbs +2 -5
- data/core/true_class.rbs +54 -21
- data/core/warning.rbs +14 -11
- data/docs/data_and_struct.md +29 -0
- data/docs/syntax.md +3 -5
- data/docs/tools.md +1 -0
- data/ext/rbs_extension/lexer.c +643 -559
- data/ext/rbs_extension/lexer.re +5 -1
- data/ext/rbs_extension/parser.c +12 -3
- data/ext/rbs_extension/unescape.c +7 -47
- data/lib/rbs/cli/diff.rb +4 -1
- data/lib/rbs/cli/validate.rb +280 -0
- data/lib/rbs/cli.rb +2 -194
- data/lib/rbs/collection/config.rb +5 -6
- data/lib/rbs/collection/sources/git.rb +1 -1
- data/lib/rbs/collection.rb +1 -0
- data/lib/rbs/diff.rb +7 -4
- data/lib/rbs/errors.rb +11 -0
- data/lib/rbs/test/errors.rb +4 -1
- data/lib/rbs/test/guaranteed.rb +2 -3
- data/lib/rbs/test/type_check.rb +15 -10
- data/lib/rbs/test.rb +3 -3
- data/lib/rbs/types.rb +29 -0
- data/lib/rbs/unit_test/convertibles.rb +176 -0
- data/lib/rbs/unit_test/spy.rb +136 -0
- data/lib/rbs/unit_test/type_assertions.rb +341 -0
- data/lib/rbs/unit_test/with_aliases.rb +143 -0
- data/lib/rbs/unit_test.rb +6 -0
- data/lib/rbs/version.rb +1 -1
- data/sig/cli/validate.rbs +43 -0
- data/sig/diff.rbs +3 -1
- data/sig/errors.rbs +8 -0
- data/sig/rbs.rbs +1 -1
- data/sig/test/errors.rbs +52 -0
- data/sig/test/guranteed.rbs +9 -0
- data/sig/test/type_check.rbs +19 -0
- data/sig/test.rbs +82 -0
- data/sig/types.rbs +6 -1
- data/sig/unit_test/convertibles.rbs +154 -0
- data/sig/unit_test/spy.rbs +28 -0
- data/sig/unit_test/type_assertions.rbs +194 -0
- data/sig/unit_test/with_aliases.rbs +136 -0
- data/stdlib/base64/0/base64.rbs +307 -45
- data/stdlib/bigdecimal/0/big_decimal.rbs +35 -15
- data/stdlib/coverage/0/coverage.rbs +2 -2
- data/stdlib/csv/0/csv.rbs +25 -55
- data/stdlib/date/0/date.rbs +1 -43
- data/stdlib/date/0/date_time.rbs +1 -13
- data/stdlib/delegate/0/delegator.rbs +186 -0
- data/stdlib/delegate/0/kernel.rbs +47 -0
- data/stdlib/delegate/0/simple_delegator.rbs +98 -0
- data/stdlib/did_you_mean/0/did_you_mean.rbs +1 -1
- data/stdlib/erb/0/erb.rbs +2 -2
- data/stdlib/fileutils/0/fileutils.rbs +0 -19
- data/stdlib/io-console/0/io-console.rbs +12 -1
- data/stdlib/ipaddr/0/ipaddr.rbs +2 -1
- data/stdlib/json/0/json.rbs +320 -81
- data/stdlib/logger/0/logger.rbs +9 -5
- data/stdlib/monitor/0/monitor.rbs +78 -0
- data/stdlib/net-http/0/net-http.rbs +1880 -543
- data/stdlib/objspace/0/objspace.rbs +19 -13
- data/stdlib/openssl/0/openssl.rbs +508 -127
- data/stdlib/optparse/0/optparse.rbs +25 -11
- data/stdlib/pathname/0/pathname.rbs +1 -1
- data/stdlib/pp/0/pp.rbs +2 -5
- data/stdlib/prettyprint/0/prettyprint.rbs +2 -2
- data/stdlib/pstore/0/pstore.rbs +2 -4
- data/stdlib/rdoc/0/comment.rbs +1 -2
- data/stdlib/resolv/0/resolv.rbs +4 -2
- data/stdlib/socket/0/socket.rbs +2 -2
- data/stdlib/socket/0/unix_socket.rbs +2 -2
- data/stdlib/strscan/0/string_scanner.rbs +3 -2
- data/stdlib/tempfile/0/tempfile.rbs +1 -1
- data/stdlib/uri/0/common.rbs +245 -123
- metadata +23 -4
- data/lib/rbs/test/spy.rb +0 -6
data/core/regexp.rbs
CHANGED
@@ -1,116 +1,271 @@
|
|
1
1
|
# <!-- rdoc-file=re.c -->
|
2
|
-
#
|
3
|
-
#
|
4
|
-
# extracting the portions that match. They are created with the `/`*pat*`/` and
|
5
|
-
# `%r{`*pat*`}` literals or the `Regexp.new` constructor.
|
2
|
+
# A [regular expression](https://en.wikipedia.org/wiki/Regular_expression) (also
|
3
|
+
# called a *regexp*) is a *match pattern* (also simply called a *pattern*).
|
6
4
|
#
|
7
|
-
# A
|
5
|
+
# A common notation for a regexp uses enclosing slash characters:
|
8
6
|
#
|
9
|
-
# /
|
10
|
-
# /y/.match('haystack') #=> #<MatchData "y">
|
7
|
+
# /foo/
|
11
8
|
#
|
12
|
-
#
|
13
|
-
# matches
|
9
|
+
# A regexp may be applied to a *target string*; the part of the string (if any)
|
10
|
+
# that matches the pattern is called a *match*, and may be said *to match*:
|
14
11
|
#
|
15
|
-
#
|
12
|
+
# re = /red/
|
13
|
+
# re.match?('redirect') # => true # Match at beginning of target.
|
14
|
+
# re.match?('bored') # => true # Match at end of target.
|
15
|
+
# re.match?('credit') # => true # Match within target.
|
16
|
+
# re.match?('foo') # => false # No match.
|
16
17
|
#
|
17
|
-
#
|
18
|
+
# ## Regexp Uses
|
18
19
|
#
|
19
|
-
#
|
20
|
+
# A regexp may be used:
|
20
21
|
#
|
21
|
-
#
|
22
|
+
# * To extract substrings based on a given pattern:
|
22
23
|
#
|
23
|
-
#
|
24
|
-
#
|
24
|
+
# re = /foo/ # => /foo/
|
25
|
+
# re.match('food') # => #<MatchData "foo">
|
26
|
+
# re.match('good') # => nil
|
25
27
|
#
|
26
|
-
#
|
27
|
-
#
|
28
|
+
# See sections [Method match](rdoc-ref:Regexp@Method+match) and [Operator
|
29
|
+
# =~](rdoc-ref:Regexp@Operator+-3D~).
|
28
30
|
#
|
29
|
-
#
|
31
|
+
# * To determine whether a string matches a given pattern:
|
30
32
|
#
|
31
|
-
#
|
33
|
+
# re.match?('food') # => true
|
34
|
+
# re.match?('good') # => false
|
32
35
|
#
|
33
|
-
#
|
34
|
-
# /#{foo}/ # => /bar/
|
36
|
+
# See section [Method match?](rdoc-ref:Regexp@Method+match-3F).
|
35
37
|
#
|
36
|
-
#
|
38
|
+
# * As an argument for calls to certain methods in other classes and modules;
|
39
|
+
# most such methods accept an argument that may be either a string or the
|
40
|
+
# (much more powerful) regexp.
|
37
41
|
#
|
38
|
-
#
|
39
|
-
# method.
|
42
|
+
# See [Regexp Methods](rdoc-ref:regexp/methods.rdoc).
|
40
43
|
#
|
41
|
-
# ### `=~` Operator
|
42
44
|
#
|
43
|
-
#
|
44
|
-
# expression and the other is a string then the regular expression is used as a
|
45
|
-
# pattern to match against the string. (This operator is equivalently defined
|
46
|
-
# by Regexp and String so the order of String and Regexp do not matter. Other
|
47
|
-
# classes may have different implementations of `=~`.) If a match is found, the
|
48
|
-
# operator returns index of first match in string, otherwise it returns `nil`.
|
45
|
+
# ## Regexp Objects
|
49
46
|
#
|
50
|
-
#
|
51
|
-
# 'haystack' =~ /hay/ #=> 0
|
52
|
-
# /a/ =~ 'haystack' #=> 1
|
53
|
-
# /u/ =~ 'haystack' #=> nil
|
47
|
+
# A regexp object has:
|
54
48
|
#
|
55
|
-
#
|
56
|
-
# after a successful match. `$~` holds a MatchData object. Regexp.last_match is
|
57
|
-
# equivalent to `$~`.
|
49
|
+
# * A source; see [Sources](rdoc-ref:Regexp@Sources).
|
58
50
|
#
|
59
|
-
#
|
51
|
+
# * Several modes; see [Modes](rdoc-ref:Regexp@Modes).
|
60
52
|
#
|
61
|
-
#
|
53
|
+
# * A timeout; see [Timeouts](rdoc-ref:Regexp@Timeouts).
|
62
54
|
#
|
63
|
-
#
|
55
|
+
# * An encoding; see [Encodings](rdoc-ref:Regexp@Encodings).
|
64
56
|
#
|
65
|
-
# ## Metacharacters and Escapes
|
66
57
|
#
|
67
|
-
#
|
68
|
-
# `+`, `*`. They have a specific meaning when appearing in a pattern. To match
|
69
|
-
# them literally they must be backslash-escaped. To match a backslash literally,
|
70
|
-
# backslash-escape it: `\\\`.
|
58
|
+
# ## Creating a Regexp
|
71
59
|
#
|
72
|
-
#
|
73
|
-
# /a\\\\b/.match('a\\\\b') #=> #<MatchData "a\\b">
|
60
|
+
# A regular expression may be created with:
|
74
61
|
#
|
75
|
-
#
|
76
|
-
#
|
77
|
-
# [below](#label-Character+Classes)).
|
62
|
+
# * A regexp literal using slash characters (see [Regexp
|
63
|
+
# Literals](rdoc-ref:syntax/literals.rdoc@Regexp+Literals)):
|
78
64
|
#
|
79
|
-
#
|
80
|
-
#
|
65
|
+
# # This is a very common usage.
|
66
|
+
# /foo/ # => /foo/
|
81
67
|
#
|
82
|
-
#
|
83
|
-
#
|
68
|
+
# * A `%r` regexp literal (see [%r: Regexp
|
69
|
+
# Literals](rdoc-ref:syntax/literals.rdoc@25r-3A+Regexp+Literals)):
|
84
70
|
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
71
|
+
# # Same delimiter character at beginning and end;
|
72
|
+
# # useful for avoiding escaping characters
|
73
|
+
# %r/name\/value pair/ # => /name\/value pair/
|
74
|
+
# %r:name/value pair: # => /name\/value pair/
|
75
|
+
# %r|name/value pair| # => /name\/value pair/
|
88
76
|
#
|
89
|
-
#
|
77
|
+
# # Certain "paired" characters can be delimiters.
|
78
|
+
# %r[foo] # => /foo/
|
79
|
+
# %r{foo} # => /foo/
|
80
|
+
# %r(foo) # => /foo/
|
81
|
+
# %r<foo> # => /foo/
|
90
82
|
#
|
91
|
-
#
|
92
|
-
# characters that may appear at that point in the match. `/[ab]/` means *a* or
|
93
|
-
# *b*, as opposed to `/ab/` which means *a* followed by *b*.
|
83
|
+
# * Method Regexp.new.
|
94
84
|
#
|
95
|
-
# /W[aeiou]rd/.match("Word") #=> #<MatchData "Word">
|
96
85
|
#
|
97
|
-
#
|
98
|
-
# inclusive range of characters. `[abcd]` is equivalent to `[a-d]`. A range can
|
99
|
-
# be followed by another range, so `[abcdwxyz]` is equivalent to `[a-dw-z]`. The
|
100
|
-
# order in which ranges or individual characters appear inside a character class
|
101
|
-
# is irrelevant.
|
86
|
+
# ## Method `match`
|
102
87
|
#
|
103
|
-
#
|
104
|
-
#
|
88
|
+
# Each of the methods Regexp#match, String#match, and Symbol#match returns a
|
89
|
+
# MatchData object if a match was found, `nil` otherwise; each also sets [global
|
90
|
+
# variables](rdoc-ref:Regexp@Global+Variables):
|
105
91
|
#
|
106
|
-
#
|
107
|
-
#
|
92
|
+
# 'food'.match(/foo/) # => #<MatchData "foo">
|
93
|
+
# 'food'.match(/bar/) # => nil
|
108
94
|
#
|
109
|
-
#
|
95
|
+
# ## Operator `=~`
|
96
|
+
#
|
97
|
+
# Each of the operators Regexp#=~, String#=~, and Symbol#=~ returns an integer
|
98
|
+
# offset if a match was found, `nil` otherwise; each also sets [global
|
99
|
+
# variables](rdoc-ref:Regexp@Global+Variables):
|
100
|
+
#
|
101
|
+
# /bar/ =~ 'foo bar' # => 4
|
102
|
+
# 'foo bar' =~ /bar/ # => 4
|
103
|
+
# /baz/ =~ 'foo bar' # => nil
|
104
|
+
#
|
105
|
+
# ## Method `match?`
|
106
|
+
#
|
107
|
+
# Each of the methods Regexp#match?, String#match?, and Symbol#match? returns
|
108
|
+
# `true` if a match was found, `false` otherwise; none sets [global
|
109
|
+
# variables](rdoc-ref:Regexp@Global+Variables):
|
110
|
+
#
|
111
|
+
# 'food'.match?(/foo/) # => true
|
112
|
+
# 'food'.match?(/bar/) # => false
|
113
|
+
#
|
114
|
+
# ## Global Variables
|
115
|
+
#
|
116
|
+
# Certain regexp-oriented methods assign values to global variables:
|
117
|
+
#
|
118
|
+
# * `#match`: see [Method match](rdoc-ref:Regexp@Method+match).
|
119
|
+
# * `#=~`: see [Operator =~](rdoc-ref:Regexp@Operator+-3D~).
|
120
|
+
#
|
121
|
+
#
|
122
|
+
# The affected global variables are:
|
123
|
+
#
|
124
|
+
# * `$~`: Returns a MatchData object, or `nil`.
|
125
|
+
# * `$&`: Returns the matched part of the string, or `nil`.
|
126
|
+
# * `` $` ``: Returns the part of the string to the left of the match, or `nil`.
|
127
|
+
# * `$'`: Returns the part of the string to the right of the match, or `nil`.
|
128
|
+
# * `$+`: Returns the last group matched, or `nil`.
|
129
|
+
# * `$1`, `$2`, etc.: Returns the first, second, etc., matched group, or
|
130
|
+
# `nil`. Note that `$0` is quite different; it returns the name of the
|
131
|
+
# currently executing program.
|
132
|
+
#
|
133
|
+
#
|
134
|
+
# Examples:
|
135
|
+
#
|
136
|
+
# # Matched string, but no matched groups.
|
137
|
+
# 'foo bar bar baz'.match('bar')
|
138
|
+
# $~ # => #<MatchData "bar">
|
139
|
+
# $& # => "bar"
|
140
|
+
# $` # => "foo "
|
141
|
+
# $' # => " bar baz"
|
142
|
+
# $+ # => nil
|
143
|
+
# $1 # => nil
|
144
|
+
#
|
145
|
+
# # Matched groups.
|
146
|
+
# /s(\w{2}).*(c)/.match('haystack')
|
147
|
+
# $~ # => #<MatchData "stac" 1:"ta" 2:"c">
|
148
|
+
# $& # => "stac"
|
149
|
+
# $` # => "hay"
|
150
|
+
# $' # => "k"
|
151
|
+
# $+ # => "c"
|
152
|
+
# $1 # => "ta"
|
153
|
+
# $2 # => "c"
|
154
|
+
# $3 # => nil
|
155
|
+
#
|
156
|
+
# # No match.
|
157
|
+
# 'foo'.match('bar')
|
158
|
+
# $~ # => nil
|
159
|
+
# $& # => nil
|
160
|
+
# $` # => nil
|
161
|
+
# $' # => nil
|
162
|
+
# $+ # => nil
|
163
|
+
# $1 # => nil
|
164
|
+
#
|
165
|
+
# Note that Regexp#match?, String#match?, and Symbol#match? do not set global
|
166
|
+
# variables.
|
167
|
+
#
|
168
|
+
# ## Sources
|
169
|
+
#
|
170
|
+
# As seen above, the simplest regexp uses a literal expression as its source:
|
171
|
+
#
|
172
|
+
# re = /foo/ # => /foo/
|
173
|
+
# re.match('food') # => #<MatchData "foo">
|
174
|
+
# re.match('good') # => nil
|
175
|
+
#
|
176
|
+
# A rich collection of available *subexpressions* gives the regexp great power
|
177
|
+
# and flexibility:
|
178
|
+
#
|
179
|
+
# * [Special characters](rdoc-ref:Regexp@Special+Characters)
|
180
|
+
# * [Source literals](rdoc-ref:Regexp@Source+Literals)
|
181
|
+
# * [Character classes](rdoc-ref:Regexp@Character+Classes)
|
182
|
+
# * [Shorthand character classes](rdoc-ref:Regexp@Shorthand+Character+Classes)
|
183
|
+
# * [Anchors](rdoc-ref:Regexp@Anchors)
|
184
|
+
# * [Alternation](rdoc-ref:Regexp@Alternation)
|
185
|
+
# * [Quantifiers](rdoc-ref:Regexp@Quantifiers)
|
186
|
+
# * [Groups and captures](rdoc-ref:Regexp@Groups+and+Captures)
|
187
|
+
# * [Unicode](rdoc-ref:Regexp@Unicode)
|
188
|
+
# * [POSIX Bracket Expressions](rdoc-ref:Regexp@POSIX+Bracket+Expressions)
|
189
|
+
# * [Comments](rdoc-ref:Regexp@Comments)
|
190
|
+
#
|
191
|
+
#
|
192
|
+
# ### Special Characters
|
193
|
+
#
|
194
|
+
# Regexp special characters, called *metacharacters*, have special meanings in
|
195
|
+
# certain contexts; depending on the context, these are sometimes
|
196
|
+
# metacharacters:
|
197
|
+
#
|
198
|
+
# . ? - + * ^ \ | $ ( ) [ ] { }
|
199
|
+
#
|
200
|
+
# To match a metacharacter literally, backslash-escape it:
|
201
|
+
#
|
202
|
+
# # Matches one or more 'o' characters.
|
203
|
+
# /o+/.match('foo') # => #<MatchData "oo">
|
204
|
+
# # Would match 'o+'.
|
205
|
+
# /o\+/.match('foo') # => nil
|
206
|
+
#
|
207
|
+
# To match a backslash literally, backslash-escape it:
|
208
|
+
#
|
209
|
+
# /\./.match('\.') # => #<MatchData ".">
|
210
|
+
# /\\./.match('\.') # => #<MatchData "\\.">
|
211
|
+
#
|
212
|
+
# Method Regexp.escape returns an escaped string:
|
213
|
+
#
|
214
|
+
# Regexp.escape('.?-+*^\|$()[]{}')
|
215
|
+
# # => "\\.\\?\\-\\+\\*\\^\\\\\\|\\$\\(\\)\\[\\]\\{\\}"
|
216
|
+
#
|
217
|
+
# ### Source Literals
|
218
|
+
#
|
219
|
+
# The source literal largely behaves like a double-quoted string; see [String
|
220
|
+
# Literals](rdoc-ref:syntax/literals.rdoc@String+Literals).
|
221
|
+
#
|
222
|
+
# In particular, a source literal may contain interpolated expressions:
|
223
|
+
#
|
224
|
+
# s = 'foo' # => "foo"
|
225
|
+
# /#{s}/ # => /foo/
|
226
|
+
# /#{s.capitalize}/ # => /Foo/
|
227
|
+
# /#{2 + 2}/ # => /4/
|
228
|
+
#
|
229
|
+
# There are differences between an ordinary string literal and a source literal;
|
230
|
+
# see [Shorthand Character
|
231
|
+
# Classes](rdoc-ref:Regexp@Shorthand+Character+Classes).
|
232
|
+
#
|
233
|
+
# * `\s` in an ordinary string literal is equivalent to a space character; in
|
234
|
+
# a source literal, it's shorthand for matching a whitespace character.
|
235
|
+
# * In an ordinary string literal, these are (needlessly) escaped characters;
|
236
|
+
# in a source literal, they are shorthands for various matching characters:
|
237
|
+
#
|
238
|
+
# \w \W \d \D \h \H \S \R
|
239
|
+
#
|
240
|
+
#
|
241
|
+
# ### Character Classes
|
242
|
+
#
|
243
|
+
# A *character class* is delimited by square brackets; it specifies that certain
|
244
|
+
# characters match at a given point in the target string:
|
245
|
+
#
|
246
|
+
# # This character class will match any vowel.
|
247
|
+
# re = /B[aeiou]rd/
|
248
|
+
# re.match('Bird') # => #<MatchData "Bird">
|
249
|
+
# re.match('Bard') # => #<MatchData "Bard">
|
250
|
+
# re.match('Byrd') # => nil
|
251
|
+
#
|
252
|
+
# A character class may contain hyphen characters to specify ranges of
|
253
|
+
# characters:
|
254
|
+
#
|
255
|
+
# # These regexps have the same effect.
|
256
|
+
# /[abcdef]/.match('foo') # => #<MatchData "f">
|
257
|
+
# /[a-f]/.match('foo') # => #<MatchData "f">
|
258
|
+
# /[a-cd-f]/.match('foo') # => #<MatchData "f">
|
259
|
+
#
|
260
|
+
# When the first character of a character class is a caret (`^`), the sense of
|
261
|
+
# the class is inverted: it matches any character *except* those specified.
|
262
|
+
#
|
263
|
+
# /[^a-eg-z]/.match('f') # => #<MatchData "f">
|
110
264
|
#
|
111
265
|
# A character class may contain another character class. By itself this isn't
|
112
|
-
# useful because `[a-z[0-9]]` describes the same set as `[a-z0-9]`.
|
113
|
-
#
|
266
|
+
# useful because `[a-z[0-9]]` describes the same set as `[a-z0-9]`.
|
267
|
+
#
|
268
|
+
# However, character classes also support the `&&` operator, which performs set
|
114
269
|
# intersection on its arguments. The two can be combined as follows:
|
115
270
|
#
|
116
271
|
# /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z))
|
@@ -119,238 +274,481 @@
|
|
119
274
|
#
|
120
275
|
# /[abh-w]/
|
121
276
|
#
|
122
|
-
#
|
123
|
-
#
|
124
|
-
# * `/./` - Any character except a newline.
|
125
|
-
# * `/./m` - Any character (the `m` modifier enables multiline mode)
|
126
|
-
# * `/\w/` - A word character (`[a-zA-Z0-9_]`)
|
127
|
-
# * `/\W/` - A non-word character (`[^a-zA-Z0-9_]`). Please take a look at
|
128
|
-
# [Bug #4044](https://bugs.ruby-lang.org/issues/4044) if using `/\W/` with
|
129
|
-
# the `/i` modifier.
|
130
|
-
# * `/\d/` - A digit character (`[0-9]`)
|
131
|
-
# * `/\D/` - A non-digit character (`[^0-9]`)
|
132
|
-
# * `/\h/` - A hexdigit character (`[0-9a-fA-F]`)
|
133
|
-
# * `/\H/` - A non-hexdigit character (`[^0-9a-fA-F]`)
|
134
|
-
# * `/\s/` - A whitespace character: `/[ \t\r\n\f\v]/`
|
135
|
-
# * `/\S/` - A non-whitespace character: `/[^ \t\r\n\f\v]/`
|
136
|
-
# * `/\R/` - A linebreak: `\n`, `\v`, `\f`, `\r` `\u0085` (NEXT LINE),
|
137
|
-
# `\u2028` (LINE SEPARATOR), `\u2029` (PARAGRAPH SEPARATOR) or `\r\n`.
|
138
|
-
#
|
139
|
-
#
|
140
|
-
# POSIX *bracket expressions* are also similar to character classes. They
|
141
|
-
# provide a portable alternative to the above, with the added benefit that they
|
142
|
-
# encompass non-ASCII characters. For instance, `/\d/` matches only the ASCII
|
143
|
-
# decimal digits (0-9); whereas `/[[:digit:]]/` matches any character in the
|
144
|
-
# Unicode *Nd* category.
|
145
|
-
#
|
146
|
-
# * `/[[:alnum:]]/` - Alphabetic and numeric character
|
147
|
-
# * `/[[:alpha:]]/` - Alphabetic character
|
148
|
-
# * `/[[:blank:]]/` - Space or tab
|
149
|
-
# * `/[[:cntrl:]]/` - Control character
|
150
|
-
# * `/[[:digit:]]/` - Digit
|
151
|
-
# * `/[[:graph:]]/` - Non-blank character (excludes spaces, control
|
152
|
-
# characters, and similar)
|
153
|
-
# * `/[[:lower:]]/` - Lowercase alphabetical character
|
154
|
-
# * `/[[:print:]]/` - Like [:graph:], but includes the space character
|
155
|
-
# * `/[[:punct:]]/` - Punctuation character
|
156
|
-
# * `/[[:space:]]/` - Whitespace character (`[:blank:]`, newline, carriage
|
157
|
-
# return, etc.)
|
158
|
-
# * `/[[:upper:]]/` - Uppercase alphabetical
|
159
|
-
# * `/[[:xdigit:]]/` - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
|
277
|
+
# ### Shorthand Character Classes
|
160
278
|
#
|
279
|
+
# Each of the following metacharacters serves as a shorthand for a character
|
280
|
+
# class:
|
161
281
|
#
|
162
|
-
#
|
282
|
+
# * `/./`: Matches any character except a newline:
|
163
283
|
#
|
164
|
-
#
|
165
|
-
#
|
166
|
-
# * `/[[:ascii:]]/` - A character in the ASCII character set
|
284
|
+
# /./.match('foo') # => #<MatchData "f">
|
285
|
+
# /./.match("\n") # => nil
|
167
286
|
#
|
168
|
-
#
|
169
|
-
#
|
170
|
-
# /[[:upper:]][[:lower:]]/.match("Hello") #=> #<MatchData "He">
|
171
|
-
# /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> #<MatchData "A6">
|
287
|
+
# * `/./m`: Matches any character, including a newline; see [Multiline
|
288
|
+
# Mode](rdoc-ref:Regexp@Multiline+Mode):
|
172
289
|
#
|
290
|
+
# /./m.match("\n") # => #<MatchData "\n">
|
173
291
|
#
|
174
|
-
#
|
292
|
+
# * `/\w/`: Matches a word character: equivalent to `[a-zA-Z0-9_]`:
|
175
293
|
#
|
176
|
-
#
|
177
|
-
#
|
178
|
-
#
|
294
|
+
# /\w/.match(' foo') # => #<MatchData "f">
|
295
|
+
# /\w/.match(' _') # => #<MatchData "_">
|
296
|
+
# /\w/.match(' ') # => nil
|
179
297
|
#
|
180
|
-
# *
|
181
|
-
# * `+` - One or more times
|
182
|
-
# * `?` - Zero or one times (optional)
|
183
|
-
# * `{`*n*`}` - Exactly *n* times
|
184
|
-
# * `{`*n*`,}` - *n* or more times
|
185
|
-
# * `{,`*m*`}` - *m* or less times
|
186
|
-
# * `{`*n*`,`*m*`}` - At least *n* and at most *m* times
|
298
|
+
# * `/\W/`: Matches a non-word character: equivalent to `[^a-zA-Z0-9_]`:
|
187
299
|
#
|
300
|
+
# /\W/.match(' ') # => #<MatchData " ">
|
301
|
+
# /\W/.match('_') # => nil
|
188
302
|
#
|
189
|
-
#
|
190
|
-
# ('e'), two 'l' characters, then one 'o':
|
303
|
+
# * `/\d/`: Matches a digit character: equivalent to `[0-9]`:
|
191
304
|
#
|
192
|
-
#
|
305
|
+
# /\d/.match('THX1138') # => #<MatchData "1">
|
306
|
+
# /\d/.match('foo') # => nil
|
193
307
|
#
|
194
|
-
#
|
308
|
+
# * `/\D/`: Matches a non-digit character: equivalent to `[^0-9]`:
|
195
309
|
#
|
196
|
-
#
|
197
|
-
#
|
198
|
-
# matching makes the minimal amount of matches necessary for overall success.
|
199
|
-
# Most greedy metacharacters can be made lazy by following them with `?`. For
|
200
|
-
# the `{n}` pattern, because it specifies an exact number of characters to match
|
201
|
-
# and not a variable number of characters, the `?` metacharacter instead makes
|
202
|
-
# the repeated pattern optional.
|
310
|
+
# /\D/.match('123Jump!') # => #<MatchData "J">
|
311
|
+
# /\D/.match('123') # => nil
|
203
312
|
#
|
204
|
-
#
|
205
|
-
# '.+' matches '<a><b>'; the second uses a lazy quantifier so '.+?' matches
|
206
|
-
# '<a>':
|
313
|
+
# * `/\h/`: Matches a hexdigit character: equivalent to `[0-9a-fA-F]`:
|
207
314
|
#
|
208
|
-
#
|
209
|
-
#
|
315
|
+
# /\h/.match('xyz fedcba9876543210') # => #<MatchData "f">
|
316
|
+
# /\h/.match('xyz') # => nil
|
210
317
|
#
|
211
|
-
#
|
318
|
+
# * `/\H/`: Matches a non-hexdigit character: equivalent to `[^0-9a-fA-F]`:
|
212
319
|
#
|
213
|
-
#
|
214
|
-
#
|
215
|
-
# they refuse to "give up" their match even if this jeopardises the overall
|
216
|
-
# match.
|
320
|
+
# /\H/.match('fedcba9876543210xyz') # => #<MatchData "x">
|
321
|
+
# /\H/.match('fedcba9876543210') # => nil
|
217
322
|
#
|
218
|
-
#
|
219
|
-
# /<.*+><.+>/.match("<a><b>") #=> nil
|
220
|
-
# /<.*><.++>/.match("<a><b>") #=> nil
|
323
|
+
# * `/\s/`: Matches a whitespace character: equivalent to `/[ \t\r\n\f\v]/`:
|
221
324
|
#
|
222
|
-
#
|
325
|
+
# /\s/.match('foo bar') # => #<MatchData " ">
|
326
|
+
# /\s/.match('foo') # => nil
|
223
327
|
#
|
224
|
-
#
|
225
|
-
#
|
226
|
-
# the *backreference* `\n` (e.g. `\1`); outside of the pattern use
|
227
|
-
# `MatchData[n]` (e.g. `MatchData[1]`).
|
328
|
+
# * `/\S/`: Matches a non-whitespace character: equivalent to `/[^
|
329
|
+
# \t\r\n\f\v]/`:
|
228
330
|
#
|
229
|
-
#
|
230
|
-
#
|
331
|
+
# /\S/.match(" \t\r\n\f\v foo") # => #<MatchData "f">
|
332
|
+
# /\S/.match(" \t\r\n\f\v") # => nil
|
231
333
|
#
|
232
|
-
#
|
233
|
-
# #=> #<MatchData "cat sat in" 1:"at">
|
334
|
+
# * `/\R/`: Matches a linebreak, platform-independently:
|
234
335
|
#
|
235
|
-
#
|
236
|
-
#
|
336
|
+
# /\R/.match("\r") # => #<MatchData "\r"> # Carriage return (CR)
|
337
|
+
# /\R/.match("\n") # => #<MatchData "\n"> # Newline (LF)
|
338
|
+
# /\R/.match("\f") # => #<MatchData "\f"> # Formfeed (FF)
|
339
|
+
# /\R/.match("\v") # => #<MatchData "\v"> # Vertical tab (VT)
|
340
|
+
# /\R/.match("\r\n") # => #<MatchData "\r\n"> # CRLF
|
341
|
+
# /\R/.match("\u0085") # => #<MatchData "\u0085"> # Next line (NEL)
|
342
|
+
# /\R/.match("\u2028") # => #<MatchData "\u2028"> # Line separator (LSEP)
|
343
|
+
# /\R/.match("\u2029") # => #<MatchData "\u2029"> # Paragraph separator (PSEP)
|
237
344
|
#
|
238
|
-
# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at'
|
239
345
|
#
|
240
|
-
#
|
241
|
-
# groups 1-9 are supported using the `\n` backreference syntax.
|
346
|
+
# ### Anchors
|
242
347
|
#
|
243
|
-
#
|
244
|
-
#
|
245
|
-
# the `\0` backreference cannot be used inside the regexp, as backreferences can
|
246
|
-
# only be used after the end of the capture group, and the `\0` backreference
|
247
|
-
# uses the implicit capture group of the entire match. However, you can use
|
248
|
-
# this backreference when doing substitution:
|
348
|
+
# An anchor is a metasequence that matches a zero-width position between
|
349
|
+
# characters in the target string.
|
249
350
|
#
|
250
|
-
#
|
251
|
-
#
|
351
|
+
# For a subexpression with no anchor, matching may begin anywhere in the target
|
352
|
+
# string:
|
252
353
|
#
|
253
|
-
#
|
354
|
+
# /real/.match('surrealist') # => #<MatchData "real">
|
254
355
|
#
|
255
|
-
#
|
256
|
-
# `(?<`*name*`>)` or `(?'`*name*`')` constructs.
|
356
|
+
# For a subexpression with an anchor, matching must begin at the matched anchor.
|
257
357
|
#
|
258
|
-
#
|
259
|
-
# #=> #<MatchData "$3.67" dollars:"3" cents:"67">
|
260
|
-
# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")[:dollars] #=> "3"
|
358
|
+
# #### Boundary Anchors
|
261
359
|
#
|
262
|
-
#
|
263
|
-
# group name.
|
360
|
+
# Each of these anchors matches a boundary:
|
264
361
|
#
|
265
|
-
#
|
266
|
-
#
|
362
|
+
# * `^`: Matches the beginning of a line:
|
363
|
+
#
|
364
|
+
# /^bar/.match("foo\nbar") # => #<MatchData "bar">
|
365
|
+
# /^ar/.match("foo\nbar") # => nil
|
366
|
+
#
|
367
|
+
# * `$`: Matches the end of a line:
|
368
|
+
#
|
369
|
+
# /bar$/.match("foo\nbar") # => #<MatchData "bar">
|
370
|
+
# /ba$/.match("foo\nbar") # => nil
|
371
|
+
#
|
372
|
+
# * `\A`: Matches the beginning of the string:
|
373
|
+
#
|
374
|
+
# /\Afoo/.match('foo bar') # => #<MatchData "foo">
|
375
|
+
# /\Afoo/.match(' foo bar') # => nil
|
376
|
+
#
|
377
|
+
# * `\Z`: Matches the end of the string; if the string ends with a single newline,
|
378
|
+
# it matches just before the ending newline:
|
379
|
+
#
|
380
|
+
# /foo\Z/.match('bar foo') # => #<MatchData "foo">
|
381
|
+
# /foo\Z/.match('foo bar') # => nil
|
382
|
+
# /foo\Z/.match("bar foo\n") # => #<MatchData "foo">
|
383
|
+
# /foo\Z/.match("bar foo\n\n") # => nil
|
384
|
+
#
|
385
|
+
# * `\z`: Matches the end of the string:
|
386
|
+
#
|
387
|
+
# /foo\z/.match('bar foo') # => #<MatchData "foo">
|
388
|
+
# /foo\z/.match('foo bar') # => nil
|
389
|
+
# /foo\z/.match("bar foo\n") # => nil
|
390
|
+
#
|
391
|
+
# * `\b`: Matches word boundary when not inside brackets; matches backspace
|
392
|
+
# (`"0x08"`) when inside brackets:
|
393
|
+
#
|
394
|
+
# /foo\b/.match('foo bar') # => #<MatchData "foo">
|
395
|
+
# /foo\b/.match('foobar') # => nil
|
396
|
+
#
|
397
|
+
# * `\B`: Matches non-word boundary:
|
398
|
+
#
|
399
|
+
# /foo\B/.match('foobar') # => #<MatchData "foo">
|
400
|
+
# /foo\B/.match('foo bar') # => nil
|
401
|
+
#
|
402
|
+
# * `\G`: Matches first matching position:
|
403
|
+
#
|
404
|
+
# In methods like String#gsub and String#scan, it changes on each iteration.
|
405
|
+
# It initially matches the beginning of the subject, and in each following
|
406
|
+
# iteration it matches where the last match finished.
|
407
|
+
#
|
408
|
+
# "    a b c".gsub(/ /, '_') # => "____a_b_c"
|
409
|
+
# "    a b c".gsub(/\G /, '_') # => "____a b c"
|
410
|
+
#
|
411
|
+
# In methods like Regexp#match and String#match that take an optional
|
412
|
+
# offset, it matches where the search begins.
|
413
|
+
#
|
414
|
+
# "hello, world".match(/,/, 3) # => #<MatchData ",">
|
415
|
+
# "hello, world".match(/\G,/, 3) # => nil
|
416
|
+
#
|
417
|
+
#
|
418
|
+
# #### Lookaround Anchors
|
419
|
+
#
|
420
|
+
# Lookahead anchors:
|
421
|
+
#
|
422
|
+
# * `(?=*pat*)`: Positive lookahead assertion: ensures that the following
|
423
|
+
# characters match *pat*, but doesn't include those characters in the
|
424
|
+
# matched substring.
|
425
|
+
#
|
426
|
+
# * `(?!*pat*)`: Negative lookahead assertion: ensures that the following
|
427
|
+
# characters *do not* match *pat*, but doesn't include those characters in
|
428
|
+
# the matched substring.
|
429
|
+
#
|
430
|
+
#
|
431
|
+
# Lookbehind anchors:
|
432
|
+
#
|
433
|
+
# * `(?<=*pat*)`: Positive lookbehind assertion: ensures that the preceding
|
434
|
+
# characters match *pat*, but doesn't include those characters in the
|
435
|
+
# matched substring.
|
436
|
+
#
|
437
|
+
# * `(?<!*pat*)`: Negative lookbehind assertion: ensures that the preceding
|
438
|
+
# characters do not match *pat*, but doesn't include those characters in the
|
439
|
+
# matched substring.
|
440
|
+
#
|
441
|
+
#
|
442
|
+
# The pattern below uses positive lookahead and positive lookbehind to match
|
443
|
+
# text appearing in `<b>`...`</b>` tags without including the tags in the match:
|
444
|
+
#
|
445
|
+
# /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favors the <b>bold</b>.")
|
446
|
+
# # => #<MatchData "bold">
|
447
|
+
#
|
448
|
+
# #### Match-Reset Anchor
|
449
|
+
#
|
450
|
+
# * `\K`: Match reset: the matched content preceding `\K` in the regexp is
|
451
|
+
# excluded from the result. For example, the following two regexps are
|
452
|
+
# almost equivalent:
|
453
|
+
#
|
454
|
+
# /ab\Kc/.match('abc') # => #<MatchData "c">
|
455
|
+
# /(?<=ab)c/.match('abc') # => #<MatchData "c">
|
456
|
+
#
|
457
|
+
# These match the same string and `$&` equals `'c'`, while the matched position
|
458
|
+
# is different.
|
459
|
+
#
|
460
|
+
# As are the following two regexps:
|
461
|
+
#
|
462
|
+
# /(a)\K(b)\Kc/
|
463
|
+
# /(?<=(?<=(a))(b))c/
|
464
|
+
#
|
465
|
+
#
|
466
|
+
# ### Alternation
|
467
|
+
#
|
468
|
+
# The vertical bar metacharacter (`|`) may be used within parentheses to express
|
469
|
+
# alternation: two or more subexpressions any of which may match the target
|
470
|
+
# string.
|
471
|
+
#
|
472
|
+
# Two alternatives:
|
473
|
+
#
|
474
|
+
# re = /(a|b)/
|
475
|
+
# re.match('foo') # => nil
|
476
|
+
# re.match('bar') # => #<MatchData "b" 1:"b">
|
477
|
+
#
|
478
|
+
# Four alternatives:
|
479
|
+
#
|
480
|
+
# re = /(a|b|c|d)/
|
481
|
+
# re.match('shazam') # => #<MatchData "a" 1:"a">
|
482
|
+
# re.match('cold') # => #<MatchData "c" 1:"c">
|
483
|
+
#
|
484
|
+
# Each alternative is a subexpression, and may be composed of other
|
485
|
+
# subexpressions:
|
486
|
+
#
|
487
|
+
# re = /([a-c]|[x-z])/
|
488
|
+
# re.match('bar') # => #<MatchData "b" 1:"b">
|
489
|
+
# re.match('ooz') # => #<MatchData "z" 1:"z">
|
490
|
+
#
|
491
|
+
# Method Regexp.union provides a convenient way to construct a regexp with
|
492
|
+
# alternatives.
|
493
|
+
#
|
494
|
+
# ### Quantifiers
|
495
|
+
#
|
496
|
+
# A simple regexp matches one character:
|
497
|
+
#
|
498
|
+
# /\w/.match('Hello') # => #<MatchData "H">
|
499
|
+
#
|
500
|
+
# An added *quantifier* specifies how many matches are required or allowed:
|
501
|
+
#
|
502
|
+
# * `*` - Matches zero or more times:
|
503
|
+
#
|
504
|
+
# /\w*/.match('')
|
505
|
+
# # => #<MatchData "">
|
506
|
+
# /\w*/.match('x')
|
507
|
+
# # => #<MatchData "x">
|
508
|
+
# /\w*/.match('xyz')
|
509
|
+
# # => #<MatchData "xyz">
|
510
|
+
#
|
511
|
+
# * `+` - Matches one or more times:
|
512
|
+
#
|
513
|
+
# /\w+/.match('') # => nil
|
514
|
+
# /\w+/.match('x') # => #<MatchData "x">
|
515
|
+
# /\w+/.match('xyz') # => #<MatchData "xyz">
|
516
|
+
#
|
517
|
+
# * `?` - Matches zero or one times:
|
518
|
+
#
|
519
|
+
# /\w?/.match('') # => #<MatchData "">
|
520
|
+
# /\w?/.match('x') # => #<MatchData "x">
|
521
|
+
# /\w?/.match('xyz') # => #<MatchData "x">
|
522
|
+
#
|
523
|
+
# * `{`*n*`}` - Matches exactly *n* times:
|
524
|
+
#
|
525
|
+
# /\w{2}/.match('') # => nil
|
526
|
+
# /\w{2}/.match('x') # => nil
|
527
|
+
# /\w{2}/.match('xyz') # => #<MatchData "xy">
|
528
|
+
#
|
529
|
+
# * `{`*min*`,}` - Matches *min* or more times:
|
530
|
+
#
|
531
|
+
# /\w{2,}/.match('') # => nil
|
532
|
+
# /\w{2,}/.match('x') # => nil
|
533
|
+
# /\w{2,}/.match('xy') # => #<MatchData "xy">
|
534
|
+
# /\w{2,}/.match('xyz') # => #<MatchData "xyz">
|
535
|
+
#
|
536
|
+
# * `{,`*max*`}` - Matches *max* or fewer times:
|
537
|
+
#
|
538
|
+
# /\w{,2}/.match('') # => #<MatchData "">
|
539
|
+
# /\w{,2}/.match('x') # => #<MatchData "x">
|
540
|
+
# /\w{,2}/.match('xyz') # => #<MatchData "xy">
|
541
|
+
#
|
542
|
+
# * `{`*min*`,`*max*`}` - Matches at least *min* times and at most *max*
|
543
|
+
# times:
|
544
|
+
#
|
545
|
+
# /\w{1,2}/.match('') # => nil
|
546
|
+
# /\w{1,2}/.match('x') # => #<MatchData "x">
|
547
|
+
# /\w{1,2}/.match('xyz') # => #<MatchData "xy">
|
548
|
+
#
|
549
|
+
#
|
550
|
+
# #### Greedy, Lazy, or Possessive Matching
|
551
|
+
#
|
552
|
+
# Quantifier matching may be greedy, lazy, or possessive:
|
553
|
+
#
|
554
|
+
# * In *greedy* matching, as many occurrences as possible are matched while
|
555
|
+
# still allowing the overall match to succeed. Greedy quantifiers: `*`, `+`,
|
556
|
+
# `?`, `{min, max}` and its variants.
|
557
|
+
# * In *lazy* matching, the minimum number of occurrences are matched. Lazy
|
558
|
+
# quantifiers: `*?`, `+?`, `??`, `{min, max}?` and its variants.
|
559
|
+
# * In *possessive* matching, once a match is found, there is no backtracking;
|
560
|
+
# that match is retained, even if it jeopardises the overall match.
|
561
|
+
# Possessive quantifiers: `*+`, `++`, `?+`. Note that `{min, max}` and its
|
562
|
+
# variants do *not* support possessive matching.
|
563
|
+
#
|
564
|
+
#
|
565
|
+
# More:
|
566
|
+
#
|
567
|
+
# * About greedy and lazy matching, see [Choosing Minimal or Maximal
|
568
|
+
# Repetition](https://doc.lagout.org/programmation/Regular%20Expressions/Reg
|
569
|
+
# ular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Progr
|
570
|
+
# amming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%20201
|
571
|
+
# 2-09-06%5D.pdf#tutorial-backtrack).
|
572
|
+
# * About possessive matching, see [Eliminate Needless
|
573
|
+
# Backtracking](https://doc.lagout.org/programmation/Regular%20Expressions/R
|
574
|
+
# egular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Pro
|
575
|
+
# gramming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202
|
576
|
+
# 012-09-06%5D.pdf#tutorial-backtrack).
|
577
|
+
#
|
578
|
+
#
|
579
|
+
# ### Groups and Captures
|
580
|
+
#
|
581
|
+
# A simple regexp has (at most) one match:
|
582
|
+
#
|
583
|
+
# re = /\d\d\d\d-\d\d-\d\d/
|
584
|
+
# re.match('1943-02-04') # => #<MatchData "1943-02-04">
|
585
|
+
# re.match('1943-02-04').size # => 1
|
586
|
+
# re.match('foo') # => nil
|
587
|
+
#
|
588
|
+
# Adding one or more pairs of parentheses, `(*subexpression*)`, defines
|
589
|
+
# *groups*, which may result in multiple matched substrings, called *captures*:
|
590
|
+
#
|
591
|
+
# re = /(\d\d\d\d)-(\d\d)-(\d\d)/
|
592
|
+
# re.match('1943-02-04') # => #<MatchData "1943-02-04" 1:"1943" 2:"02" 3:"04">
|
593
|
+
# re.match('1943-02-04').size # => 4
|
594
|
+
#
|
595
|
+
# The first capture is the entire matched string; the other captures are the
|
596
|
+
# matched substrings from the groups.
|
597
|
+
#
|
598
|
+
# A group may have a [quantifier](rdoc-ref:Regexp@Quantifiers):
|
599
|
+
#
|
600
|
+
# re = /July 4(th)?/
|
601
|
+
# re.match('July 4') # => #<MatchData "July 4" 1:nil>
|
602
|
+
# re.match('July 4th') # => #<MatchData "July 4th" 1:"th">
|
603
|
+
#
|
604
|
+
# re = /(foo)*/
|
605
|
+
# re.match('') # => #<MatchData "" 1:nil>
|
606
|
+
# re.match('foo') # => #<MatchData "foo" 1:"foo">
|
607
|
+
# re.match('foofoo') # => #<MatchData "foofoo" 1:"foo">
|
608
|
+
#
|
609
|
+
# re = /(foo)+/
|
610
|
+
# re.match('') # => nil
|
611
|
+
# re.match('foo') # => #<MatchData "foo" 1:"foo">
|
612
|
+
# re.match('foofoo') # => #<MatchData "foofoo" 1:"foo">
|
613
|
+
#
|
614
|
+
# The returned MatchData object gives access to the matched substrings:
|
615
|
+
#
|
616
|
+
# re = /(\d\d\d\d)-(\d\d)-(\d\d)/
|
617
|
+
# md = re.match('1943-02-04')
|
618
|
+
# # => #<MatchData "1943-02-04" 1:"1943" 2:"02" 3:"04">
|
619
|
+
# md[0] # => "1943-02-04"
|
620
|
+
# md[1] # => "1943"
|
621
|
+
# md[2] # => "02"
|
622
|
+
# md[3] # => "04"
|
267
623
|
#
|
268
|
-
#
|
269
|
-
# simultaneously. Also, if a named capture is used in a regexp, then parentheses
|
270
|
-
# used for grouping which would otherwise result in a unnamed capture are
|
271
|
-
# treated as non-capturing.
|
624
|
+
# #### Non-Capturing Groups
|
272
625
|
#
|
273
|
-
#
|
274
|
-
#
|
626
|
+
# A group may be made non-capturing; it is still a group (and, for example, can
|
627
|
+
# have a quantifier), but its matching substring is not included among the
|
628
|
+
# captures.
|
275
629
|
#
|
276
|
-
#
|
277
|
-
# /(?<c>\w)(\w)/.match("ab").named_captures # => {"c"=>"a"}
|
630
|
+
# A non-capturing group begins with `?:` (inside the parentheses):
|
278
631
|
#
|
279
|
-
#
|
280
|
-
#
|
281
|
-
#
|
632
|
+
# # Don't capture the year.
|
633
|
+
# re = /(?:\d\d\d\d)-(\d\d)-(\d\d)/
|
634
|
+
# md = re.match('1943-02-04') # => #<MatchData "1943-02-04" 1:"02" 2:"04">
|
282
635
|
#
|
283
|
-
#
|
284
|
-
# dollars #=> "3"
|
636
|
+
# #### Backreferences
|
285
637
|
#
|
286
|
-
#
|
638
|
+
# A group match may also be referenced within the regexp itself; such a
|
639
|
+
# reference is called a `backreference`:
|
287
640
|
#
|
288
|
-
#
|
289
|
-
#
|
641
|
+
# /[csh](..) [csh]\1 in/.match('The cat sat in the hat')
|
642
|
+
# # => #<MatchData "cat sat in" 1:"at">
|
290
643
|
#
|
291
|
-
#
|
644
|
+
# This table shows how each subexpression in the regexp above matches a
|
645
|
+
# substring in the target string:
|
292
646
|
#
|
293
|
-
#
|
647
|
+
# | Subexpression in Regexp | Matching Substring in Target String |
|
648
|
+
# |---------------------------|-------------------------------------|
|
649
|
+
# | First '[csh]' | Character 'c' |
|
650
|
+
# | '(..)' | First substring 'at' |
|
651
|
+
# | First space ' ' | First space character ' ' |
|
652
|
+
# | Second '[csh]' | Character 's' |
|
653
|
+
# | '\1' (backreference 'at') | Second substring 'at' |
|
654
|
+
# | ' in' | Substring ' in' |
|
294
655
|
#
|
295
|
-
#
|
296
|
-
# twice, i.e. `[aeiou]\w[aeiou]\w`: 'enor'.
|
656
|
+
# A regexp may contain any number of groups:
|
297
657
|
#
|
298
|
-
#
|
299
|
-
# #=> #<MatchData "enor" 1:"or">
|
658
|
+
# * For a large number of groups:
|
300
659
|
#
|
301
|
-
#
|
302
|
-
#
|
303
|
-
# backreference. This benefits performance at the slight expense of readability.
|
660
|
+
# * The ordinary `\`*n* notation applies only for *n* in range (1..9).
|
661
|
+
# * The `MatchData[*n*]` notation applies for any non-negative *n*.
|
304
662
|
#
|
305
|
-
# The first group of parentheses captures 'n' and the second 'ti'. The second
|
306
|
-
# group is referred to later with the backreference `\2`:
|
307
663
|
#
|
308
|
-
#
|
309
|
-
#
|
664
|
+
# * `\0` is a special backreference, referring to the entire matched string;
|
665
|
+
# it may not be used within the regexp itself, but may be used outside it
|
666
|
+
# (for example, in a substitution method call):
|
310
667
|
#
|
311
|
-
#
|
312
|
-
#
|
313
|
-
# backreference `\1` now refers to 'ti'.
|
668
|
+
# 'The cat sat in the hat'.gsub(/[csh]at/, '\0s')
|
669
|
+
# # => "The cats sats in the hats"
|
314
670
|
#
|
315
|
-
# /I(?:n)ves(ti)ga\1ons/.match("Investigations")
|
316
|
-
# #=> #<MatchData "Investigations" 1:"ti">
|
317
671
|
#
|
318
|
-
#
|
672
|
+
# #### Named Captures
|
319
673
|
#
|
320
|
-
#
|
321
|
-
#
|
322
|
-
#
|
323
|
-
# the entire subexpression must be abandoned and subsequently revisited. In this
|
324
|
-
# way *pat* is treated as a non-divisible whole. Atomic grouping is typically
|
325
|
-
# used to optimise patterns so as to prevent the regular expression engine from
|
326
|
-
# backtracking needlessly.
|
674
|
+
# As seen above, a capture can be referred to by its number. A capture can also
|
675
|
+
# have a name, prefixed as `?<*name*>` or `?'*name*'`, and the name (symbolized)
|
676
|
+
# may be used as an index in `MatchData[]`:
|
327
677
|
#
|
328
|
-
#
|
329
|
-
#
|
330
|
-
#
|
331
|
-
#
|
678
|
+
# md = /\$(?<dollars>\d+)\.(?'cents'\d+)/.match("$3.67")
|
679
|
+
# # => #<MatchData "$3.67" dollars:"3" cents:"67">
|
680
|
+
# md[:dollars] # => "3"
|
681
|
+
# md[:cents] # => "67"
|
682
|
+
# # The capture numbers are still valid.
|
683
|
+
# md[2] # => "67"
|
332
684
|
#
|
333
|
-
#
|
685
|
+
# When a regexp contains a named capture, there are no unnamed captures:
|
334
686
|
#
|
335
|
-
#
|
336
|
-
#
|
687
|
+
# /\$(?<dollars>\d+)\.(\d+)/.match("$3.67")
|
688
|
+
# # => #<MatchData "$3.67" dollars:"3">
|
337
689
|
#
|
338
|
-
#
|
690
|
+
# A named group may be backreferenced as `\k<*name*>`:
|
339
691
|
#
|
340
|
-
#
|
692
|
+
# /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy')
|
693
|
+
# # => #<MatchData "ototo" vowel:"o">
|
694
|
+
#
|
695
|
+
# When (and only when) a regexp contains named capture groups and appears before
|
696
|
+
# the `=~` operator, the captured substrings are assigned to local variables
|
697
|
+
# with corresponding names:
|
698
|
+
#
|
699
|
+
# /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ '$3.67'
|
700
|
+
# dollars # => "3"
|
701
|
+
# cents # => "67"
|
702
|
+
#
|
703
|
+
# Method MatchData#named_captures returns a hash of the capture names and
|
704
|
+
# substrings; method Regexp#names returns an array of the capture names.
|
705
|
+
#
|
706
|
+
# #### Atomic Grouping
|
707
|
+
#
|
708
|
+
# A group may be made *atomic* with `(?>`*subexpression*`)`.
|
709
|
+
#
|
710
|
+
# This causes the subexpression to be matched independently of the rest of the
|
711
|
+
# expression, so that the matched substring becomes fixed for the remainder of
|
712
|
+
# the match, unless the entire subexpression must be abandoned and subsequently
|
713
|
+
# revisited.
|
714
|
+
#
|
715
|
+
# In this way *subexpression* is treated as a non-divisible whole. Atomic
|
716
|
+
# grouping is typically used to optimise patterns to prevent needless
|
717
|
+
# backtracking.
|
718
|
+
#
|
719
|
+
# Example (without atomic grouping):
|
341
720
|
#
|
342
|
-
#
|
343
|
-
# which can be a group name or number, again. This differs from backreferences
|
344
|
-
# in that it re-executes the group rather than simply trying to re-match the
|
345
|
-
# same text.
|
721
|
+
# /".*"/.match('"Quote"') # => #<MatchData "\"Quote\"">
|
346
722
|
#
|
347
|
-
#
|
348
|
-
# tries to call that the `paren` sub-expression again but fails, then matches a
|
349
|
-
# literal *)*:
|
723
|
+
# Analysis:
|
350
724
|
#
|
351
|
-
#
|
725
|
+
# 1. The leading subexpression `"` in the pattern matches the first character
|
726
|
+
# `"` in the target string.
|
727
|
+
# 2. The next subexpression `.*` matches the next substring `Quote"` (including
|
728
|
+
# the trailing double-quote).
|
729
|
+
# 3. Now there is nothing left in the target string to match the trailing
|
730
|
+
# subexpression `"` in the pattern; this would cause the overall match to
|
731
|
+
# fail.
|
732
|
+
# 4. The matched substring is backtracked by one position: `Quote`.
|
733
|
+
# 5. The final subexpression `"` now matches the final substring `"`, and the
|
734
|
+
# overall match succeeds.
|
352
735
|
#
|
353
|
-
#
|
736
|
+
#
|
737
|
+
# If subexpression `.*` is grouped atomically, the backtracking is disabled, and
|
738
|
+
# the overall match fails:
|
739
|
+
#
|
740
|
+
# /"(?>.*)"/.match('"Quote"') # => nil
|
741
|
+
#
|
742
|
+
# Atomic grouping can affect performance; see [Atomic
|
743
|
+
# Group](https://www.regular-expressions.info/atomic.html).
|
744
|
+
#
|
745
|
+
# #### Subexpression Calls
|
746
|
+
#
|
747
|
+
# As seen above, a backreference number (`\`*n*) or name (`\k<*name*>`) gives
|
748
|
+
# access to a captured *substring*; the corresponding regexp *subexpression* may
|
749
|
+
# also be accessed, via the number (`\g<`*n*`>`) or name (`\g<*name*>`):
|
750
|
+
#
|
751
|
+
# /\A(?<paren>\(\g<paren>*\))*\z/.match('(())')
|
354
752
|
# # ^1
|
355
753
|
# # ^2
|
356
754
|
# # ^3
|
@@ -362,415 +760,576 @@
|
|
362
760
|
# # ^9
|
363
761
|
# # ^10
|
364
762
|
#
|
763
|
+
# The pattern:
|
764
|
+
#
|
365
765
|
# 1. Matches at the beginning of the string, i.e. before the first character.
|
366
|
-
# 2. Enters a named
|
367
|
-
# 3. Matches
|
368
|
-
# 4. Calls the `paren` group again, i.e. recurses back to the
|
369
|
-
# 5. Re-enters the `paren` group
|
370
|
-
# 6. Matches
|
371
|
-
# 7.
|
372
|
-
# an overall successful match
|
373
|
-
# 8.
|
374
|
-
#
|
375
|
-
# 9.
|
376
|
-
# 10.
|
377
|
-
#
|
378
|
-
#
|
379
|
-
#
|
380
|
-
#
|
381
|
-
#
|
382
|
-
#
|
383
|
-
#
|
384
|
-
#
|
385
|
-
#
|
386
|
-
#
|
387
|
-
#
|
388
|
-
#
|
389
|
-
#
|
390
|
-
#
|
391
|
-
#
|
392
|
-
#
|
393
|
-
#
|
394
|
-
#
|
395
|
-
#
|
396
|
-
#
|
397
|
-
#
|
398
|
-
#
|
399
|
-
#
|
400
|
-
#
|
401
|
-
#
|
402
|
-
#
|
403
|
-
#
|
404
|
-
#
|
405
|
-
#
|
766
|
+
# 2. Enters a named group `paren`.
|
767
|
+
# 3. Matches the first character in the string, `'('`.
|
768
|
+
# 4. Calls the `paren` group again, i.e. recurses back to the second step.
|
769
|
+
# 5. Re-enters the `paren` group.
|
770
|
+
# 6. Matches the second character in the string, `'('`.
|
771
|
+
# 7. Attempts to call `paren` a third time, but fails because doing so would
|
772
|
+
# prevent an overall successful match.
|
773
|
+
# 8. Matches the third character in the string, `')'`; marks the end of the
|
774
|
+
# second recursive call.
|
775
|
+
# 9. Matches the fourth character in the string, `')'`.
|
776
|
+
# 10. Matches the end of the string.
|
777
|
+
#
|
778
|
+
#
|
779
|
+
# See [Subexpression
|
780
|
+
# calls](https://learnbyexample.github.io/Ruby_Regexp/groupings-and-backreferenc
|
781
|
+
# es.html?highlight=subexpression#subexpression-calls).
|
782
|
+
#
|
783
|
+
# #### Conditionals
|
784
|
+
#
|
785
|
+
# The conditional construct takes the form `(?(*cond*)*yes*|*no*)`, where:
|
786
|
+
#
|
787
|
+
# * *cond* may be a capture number or name.
|
788
|
+
# * The match to be applied is *yes* if *cond* is captured; otherwise the
|
789
|
+
# match to be applied is *no*.
|
790
|
+
# * If not needed, `|`*no* may be omitted.
|
791
|
+
#
|
792
|
+
#
|
793
|
+
# Examples:
|
794
|
+
#
|
795
|
+
# re = /\A(foo)?(?(1)(T)|(F))\z/
|
796
|
+
# re.match('fooT') # => #<MatchData "fooT" 1:"foo" 2:"T" 3:nil>
|
797
|
+
# re.match('F') # => #<MatchData "F" 1:nil 2:nil 3:"F">
|
798
|
+
# re.match('fooF') # => nil
|
799
|
+
# re.match('T') # => nil
|
800
|
+
#
|
801
|
+
# re = /\A(?<xyzzy>foo)?(?(<xyzzy>)(T)|(F))\z/
|
802
|
+
# re.match('fooT') # => #<MatchData "fooT" xyzzy:"foo">
|
803
|
+
# re.match('F') # => #<MatchData "F" xyzzy:nil>
|
804
|
+
# re.match('fooF') # => nil
|
805
|
+
# re.match('T') # => nil
|
806
|
+
#
|
807
|
+
# #### Absence Operator
|
808
|
+
#
|
809
|
+
# The absence operator is a special group that matches anything which does *not*
|
810
|
+
# match the contained subexpressions.
|
811
|
+
#
|
812
|
+
# /(?~real)/.match('surrealist') # => #<MatchData "surrea">
|
813
|
+
# /(?~real)ist/.match('surrealist') # => #<MatchData "ealist">
|
814
|
+
# /sur(?~real)ist/.match('surrealist') # => nil
|
815
|
+
#
|
816
|
+
# ### Unicode
|
817
|
+
#
|
818
|
+
# #### Unicode Properties
|
819
|
+
#
|
820
|
+
# The `/\p{*property_name*}/` construct (with lowercase `p`) matches characters
|
821
|
+
# using a Unicode property name, much like a character class; property `Alpha`
|
822
|
+
# specifies alphabetic characters:
|
823
|
+
#
|
824
|
+
# /\p{Alpha}/.match('a') # => #<MatchData "a">
|
825
|
+
# /\p{Alpha}/.match('1') # => nil
|
826
|
+
#
|
827
|
+
# A property can be inverted by prefixing the name with a caret character (`^`):
|
828
|
+
#
|
829
|
+
# /\p{^Alpha}/.match('1') # => #<MatchData "1">
|
830
|
+
# /\p{^Alpha}/.match('a') # => nil
|
831
|
+
#
|
832
|
+
# Or by using `\P` (uppercase `P`):
|
833
|
+
#
|
834
|
+
# /\P{Alpha}/.match('1') # => #<MatchData "1">
|
835
|
+
# /\P{Alpha}/.match('a') # => nil
|
836
|
+
#
|
837
|
+
# See [Unicode Properties](rdoc-ref:regexp/unicode_properties.rdoc) for regexps
|
838
|
+
# based on the numerous properties.
|
839
|
+
#
|
840
|
+
# Some commonly-used properties correspond to POSIX bracket expressions:
|
841
|
+
#
|
842
|
+
# * `/\p{Alnum}/`: Alphabetic and numeric character
|
843
|
+
# * `/\p{Alpha}/`: Alphabetic character
|
844
|
+
# * `/\p{Blank}/`: Space or tab
|
845
|
+
# * `/\p{Cntrl}/`: Control character
|
846
|
+
# * `/\p{Digit}/`: Digit character (and similar)
|
847
|
+
# * `/\p{Lower}/`: Lowercase alphabetical character
|
848
|
+
# * `/\p{Print}/`: Like `\p{Graph}`, but includes the space character
|
849
|
+
# * `/\p{Punct}/`: Punctuation character
|
850
|
+
# * `/\p{Space}/`: Whitespace character (`[:blank:]`, newline, carriage
|
406
851
|
# return, etc.)
|
407
|
-
# * `/\p{Upper}
|
408
|
-
# * `/\p{XDigit}
|
409
|
-
# * `/\p{Word}/` - A member of one of the following Unicode general category
|
410
|
-
# *Letter*, *Mark*, *Number*, *Connector_Punctuation*
|
411
|
-
# * `/\p{ASCII}/` - A character in the ASCII character set
|
412
|
-
# * `/\p{Any}/` - Any Unicode character (including unassigned characters)
|
413
|
-
# * `/\p{Assigned}/` - An assigned character
|
414
|
-
#
|
415
|
-
#
|
416
|
-
# A Unicode character's *General Category* value can also be matched with
|
417
|
-
# `\p{`*Ab*`}` where *Ab* is the category's abbreviation as described below:
|
418
|
-
#
|
419
|
-
# * `/\p{L}/` - 'Letter'
|
420
|
-
# * `/\p{Ll}/` - 'Letter: Lowercase'
|
421
|
-
# * `/\p{Lm}/` - 'Letter: Mark'
|
422
|
-
# * `/\p{Lo}/` - 'Letter: Other'
|
423
|
-
# * `/\p{Lt}/` - 'Letter: Titlecase'
|
424
|
-
# * `/\p{Lu}/` - 'Letter: Uppercase
|
425
|
-
# * `/\p{Lo}/` - 'Letter: Other'
|
426
|
-
# * `/\p{M}/` - 'Mark'
|
427
|
-
# * `/\p{Mn}/` - 'Mark: Nonspacing'
|
428
|
-
# * `/\p{Mc}/` - 'Mark: Spacing Combining'
|
429
|
-
# * `/\p{Me}/` - 'Mark: Enclosing'
|
430
|
-
# * `/\p{N}/` - 'Number'
|
431
|
-
# * `/\p{Nd}/` - 'Number: Decimal Digit'
|
432
|
-
# * `/\p{Nl}/` - 'Number: Letter'
|
433
|
-
# * `/\p{No}/` - 'Number: Other'
|
434
|
-
# * `/\p{P}/` - 'Punctuation'
|
435
|
-
# * `/\p{Pc}/` - 'Punctuation: Connector'
|
436
|
-
# * `/\p{Pd}/` - 'Punctuation: Dash'
|
437
|
-
# * `/\p{Ps}/` - 'Punctuation: Open'
|
438
|
-
# * `/\p{Pe}/` - 'Punctuation: Close'
|
439
|
-
# * `/\p{Pi}/` - 'Punctuation: Initial Quote'
|
440
|
-
# * `/\p{Pf}/` - 'Punctuation: Final Quote'
|
441
|
-
# * `/\p{Po}/` - 'Punctuation: Other'
|
442
|
-
# * `/\p{S}/` - 'Symbol'
|
443
|
-
# * `/\p{Sm}/` - 'Symbol: Math'
|
444
|
-
# * `/\p{Sc}/` - 'Symbol: Currency'
|
445
|
-
# * `/\p{Sc}/` - 'Symbol: Currency'
|
446
|
-
# * `/\p{Sk}/` - 'Symbol: Modifier'
|
447
|
-
# * `/\p{So}/` - 'Symbol: Other'
|
448
|
-
# * `/\p{Z}/` - 'Separator'
|
449
|
-
# * `/\p{Zs}/` - 'Separator: Space'
|
450
|
-
# * `/\p{Zl}/` - 'Separator: Line'
|
451
|
-
# * `/\p{Zp}/` - 'Separator: Paragraph'
|
452
|
-
# * `/\p{C}/` - 'Other'
|
453
|
-
# * `/\p{Cc}/` - 'Other: Control'
|
454
|
-
# * `/\p{Cf}/` - 'Other: Format'
|
455
|
-
# * `/\p{Cn}/` - 'Other: Not Assigned'
|
456
|
-
# * `/\p{Co}/` - 'Other: Private Use'
|
457
|
-
# * `/\p{Cs}/` - 'Other: Surrogate'
|
458
|
-
#
|
459
|
-
#
|
460
|
-
# Lastly, `\p{}` matches a character's Unicode *script*. The following scripts
|
461
|
-
# are supported: *Arabic*, *Armenian*, *Balinese*, *Bengali*, *Bopomofo*,
|
462
|
-
# *Braille*, *Buginese*, *Buhid*, *Canadian_Aboriginal*, *Carian*, *Cham*,
|
463
|
-
# *Cherokee*, *Common*, *Coptic*, *Cuneiform*, *Cypriot*, *Cyrillic*, *Deseret*,
|
464
|
-
# *Devanagari*, *Ethiopic*, *Georgian*, *Glagolitic*, *Gothic*, *Greek*,
|
465
|
-
# *Gujarati*, *Gurmukhi*, *Han*, *Hangul*, *Hanunoo*, *Hebrew*, *Hiragana*,
|
466
|
-
# *Inherited*, *Kannada*, *Katakana*, *Kayah_Li*, *Kharoshthi*, *Khmer*, *Lao*,
|
467
|
-
# *Latin*, *Lepcha*, *Limbu*, *Linear_B*, *Lycian*, *Lydian*, *Malayalam*,
|
468
|
-
# *Mongolian*, *Myanmar*, *New_Tai_Lue*, *Nko*, *Ogham*, *Ol_Chiki*,
|
469
|
-
# *Old_Italic*, *Old_Persian*, *Oriya*, *Osmanya*, *Phags_Pa*, *Phoenician*,
|
470
|
-
# *Rejang*, *Runic*, *Saurashtra*, *Shavian*, *Sinhala*, *Sundanese*,
|
471
|
-
# *Syloti_Nagri*, *Syriac*, *Tagalog*, *Tagbanwa*, *Tai_Le*, *Tamil*, *Telugu*,
|
472
|
-
# *Thaana*, *Thai*, *Tibetan*, *Tifinagh*, *Ugaritic*, *Vai*, and *Yi*.
|
473
|
-
#
|
474
|
-
# Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and belongs to the
|
475
|
-
# Arabic script:
|
476
|
-
#
|
477
|
-
# /\p{Arabic}/.match("\u06E9") #=> #<MatchData "\u06E9">
|
478
|
-
#
|
479
|
-
# All character properties can be inverted by prefixing their name with a caret
|
480
|
-
# (`^`).
|
481
|
-
#
|
482
|
-
# Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so this
|
483
|
-
# match succeeds:
|
484
|
-
#
|
485
|
-
# /\p{^Ll}/.match("A") #=> #<MatchData "A">
|
486
|
-
#
|
487
|
-
# ## Anchors
|
488
|
-
#
|
489
|
-
# Anchors are metacharacter that match the zero-width positions between
|
490
|
-
# characters, *anchoring* the match to a specific position.
|
491
|
-
#
|
492
|
-
# * `^` - Matches beginning of line
|
493
|
-
# * `$` - Matches end of line
|
494
|
-
# * `\A` - Matches beginning of string.
|
495
|
-
# * `\Z` - Matches end of string. If string ends with a newline, it matches
|
496
|
-
# just before newline
|
497
|
-
# * `\z` - Matches end of string
|
498
|
-
# * `\G` - Matches first matching position:
|
499
|
-
#
|
500
|
-
# In methods like `String#gsub` and `String#scan`, it changes on each
|
501
|
-
# iteration. It initially matches the beginning of subject, and in each
|
502
|
-
# following iteration it matches where the last match finished.
|
503
|
-
#
|
504
|
-
# " a b c".gsub(/ /, '_') #=> "____a_b_c"
|
505
|
-
# " a b c".gsub(/\G /, '_') #=> "____a b c"
|
506
|
-
#
|
507
|
-
# In methods like `Regexp#match` and `String#match` that take an (optional)
|
508
|
-
# offset, it matches where the search begins.
|
852
|
+
# * `/\p{Upper}/`: Uppercase alphabetical character
|
853
|
+
# * `/\p{XDigit}/`: Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
|
509
854
|
#
|
510
|
-
# "hello, world".match(/,/, 3) #=> #<MatchData ",">
|
511
|
-
# "hello, world".match(/\G,/, 3) #=> nil
|
512
855
|
#
|
513
|
-
#
|
514
|
-
# when inside brackets
|
515
|
-
# * `\B` - Matches non-word boundaries
|
516
|
-
# * `(?=`*pat*`)` - *Positive lookahead* assertion: ensures that the following
|
517
|
-
# characters match *pat*, but doesn't include those characters in the
|
518
|
-
# matched text
|
519
|
-
# * `(?!`*pat*`)` - *Negative lookahead* assertion: ensures that the following
|
520
|
-
# characters do not match *pat*, but doesn't include those characters in the
|
521
|
-
# matched text
|
522
|
-
# * `(?<=`*pat*`)` - *Positive lookbehind* assertion: ensures that the
|
523
|
-
# preceding characters match *pat*, but doesn't include those characters in
|
524
|
-
# the matched text
|
525
|
-
# * `(?<!`*pat*`)` - *Negative lookbehind* assertion: ensures that the
|
526
|
-
# preceding characters do not match *pat*, but doesn't include those
|
527
|
-
# characters in the matched text
|
528
|
-
#
|
529
|
-
# * `\K` - *Match reset*: the matched content preceding `\K` in the regexp is
|
530
|
-
# excluded from the result. For example, the following two regexps are
|
531
|
-
# almost equivalent:
|
856
|
+
# These are also commonly used:
|
532
857
|
#
|
533
|
-
#
|
534
|
-
#
|
858
|
+
# * `/\p{Emoji}/`: Unicode emoji.
|
859
|
+
# * `/\p{Graph}/`: Non-blank character (excludes spaces, control characters,
|
860
|
+
# and similar).
|
861
|
+
# * `/\p{Word}/`: A member in one of these Unicode character categories (see
|
862
|
+
# below) or having one of these Unicode properties:
|
535
863
|
#
|
536
|
-
#
|
537
|
-
#
|
864
|
+
# * Unicode categories:
|
865
|
+
# * `Mark` (`M`).
|
866
|
+
# * `Decimal Number` (`Nd`)
|
867
|
+
# * `Connector Punctuation` (`Pc`).
|
538
868
|
#
|
539
|
-
# As are the following two regexps:
|
540
869
|
#
|
541
|
-
#
|
542
|
-
#
|
870
|
+
# * Unicode properties:
|
871
|
+
# * `Alpha`
|
872
|
+
# * `Join_Control`
|
543
873
|
#
|
544
874
|
#
|
545
|
-
# If a pattern isn't anchored it can begin at any point in the string:
|
546
875
|
#
|
547
|
-
#
|
876
|
+
# * `/\p{ASCII}/`: A character in the ASCII character set.
|
877
|
+
# * `/\p{Any}/`: Any Unicode character (including unassigned characters).
|
878
|
+
# * `/\p{Assigned}/`: An assigned character.
|
548
879
|
#
|
549
|
-
# Anchoring the pattern to the beginning of the string forces the match to start
|
550
|
-
# there. 'real' doesn't occur at the beginning of the string, so now the match
|
551
|
-
# fails:
|
552
880
|
#
|
553
|
-
#
|
881
|
+
# #### Unicode Character Categories
|
554
882
|
#
|
555
|
-
#
|
556
|
-
# does not occur at a word boundary.
|
883
|
+
# A Unicode character category name:
|
557
884
|
#
|
558
|
-
#
|
885
|
+
# * May be either its full name or its abbreviated name.
|
886
|
+
# * Is case-insensitive.
|
887
|
+
# * Treats a space, a hyphen, and an underscore as equivalent.
|
559
888
|
#
|
560
|
-
# Whereas in the following example 'and' has been anchored to a non-word
|
561
|
-
# boundary so instead of matching the first 'and' it matches from the fourth
|
562
|
-
# letter of 'demand' instead:
|
563
889
|
#
|
564
|
-
#
|
890
|
+
# Examples:
|
565
891
|
#
|
566
|
-
#
|
567
|
-
#
|
892
|
+
# /\p{lu}/ # => /\p{lu}/
|
893
|
+
# /\p{LU}/ # => /\p{LU}/
|
894
|
+
# /\p{Uppercase Letter}/ # => /\p{Uppercase Letter}/
|
895
|
+
# /\p{Uppercase_Letter}/ # => /\p{Uppercase_Letter}/
|
896
|
+
# /\p{UPPERCASE-LETTER}/ # => /\p{UPPERCASE-LETTER}/
|
568
897
|
#
|
569
|
-
#
|
570
|
-
#
|
898
|
+
# Below are the Unicode character category abbreviations and names. Enumerations
|
899
|
+
# of characters in each category are at the links.
|
571
900
|
#
|
572
|
-
#
|
901
|
+
# Letters:
|
573
902
|
#
|
574
|
-
#
|
575
|
-
#
|
903
|
+
# * `L`, `Letter`: `LC`, `Lm`, or `Lo`.
|
904
|
+
# * `LC`, `Cased_Letter`: `Ll`, `Lt`, or `Lu`.
|
905
|
+
# * [Ll, Lowercase_Letter](https://www.compart.com/en/unicode/category/Ll).
|
906
|
+
# * [Lm, Modifier_Letter](https://www.compart.com/en/unicode/category/Lm).
|
907
|
+
# * [Lo, Other_Letter](https://www.compart.com/en/unicode/category/Lo).
|
908
|
+
# * [Lt, Titlecase_Letter](https://www.compart.com/en/unicode/category/Lt).
|
909
|
+
# * [Lu, Uppercase_Letter](https://www.compart.com/en/unicode/category/Lu).
|
576
910
|
#
|
577
|
-
# * `/pat/i` - Ignore case
|
578
|
-
# * `/pat/m` - Treat a newline as a character matched by `.`
|
579
|
-
# * `/pat/x` - Ignore whitespace and comments in the pattern
|
580
|
-
# * `/pat/o` - Perform `#{}` interpolation only once
|
581
911
|
#
|
912
|
+
# Marks:
|
582
913
|
#
|
583
|
-
# `
|
584
|
-
#
|
585
|
-
#
|
914
|
+
# * `M`, `Mark`: `Mc`, `Me`, or `Mn`.
|
915
|
+
# * [Mc, Spacing_Mark](https://www.compart.com/en/unicode/category/Mc).
|
916
|
+
# * [Me, Enclosing_Mark](https://www.compart.com/en/unicode/category/Me).
|
917
|
+
# * [Mn, Nonspacing_Mark](https://www.compart.com/en/unicode/category/Mn).
|
586
918
|
#
|
587
|
-
# /a(?i:b)c/.match('aBc') #=> #<MatchData "aBc">
|
588
|
-
# /a(?-i:b)c/i.match('ABC') #=> nil
|
589
919
|
#
|
590
|
-
#
|
591
|
-
# pattern:
|
920
|
+
# Numbers:
|
592
921
|
#
|
593
|
-
#
|
922
|
+
# * `N`, `Number`: `Nd`, `Nl`, or `No`.
|
923
|
+
# * [Nd, Decimal_Number](https://www.compart.com/en/unicode/category/Nd).
|
924
|
+
# * [Nl, Letter_Number](https://www.compart.com/en/unicode/category/Nl).
|
925
|
+
# * [No, Other_Number](https://www.compart.com/en/unicode/category/No).
|
594
926
|
#
|
595
|
-
# Options may also be used with `Regexp.new`:
|
596
927
|
#
|
597
|
-
#
|
598
|
-
# Regexp.new("abc", Regexp::MULTILINE) #=> /abc/m
|
599
|
-
# Regexp.new("abc # Comment", Regexp::EXTENDED) #=> /abc # Comment/x
|
600
|
-
# Regexp.new("abc", Regexp::IGNORECASE | Regexp::MULTILINE) #=> /abc/mi
|
928
|
+
# Punctuation:
|
601
929
|
#
|
602
|
-
#
|
603
|
-
#
|
604
|
-
#
|
605
|
-
#
|
930
|
+
# * `P`, `Punctuation`: `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`.
|
931
|
+
# * [Pc,
|
932
|
+
# Connector_Punctuation](https://www.compart.com/en/unicode/category/Pc).
|
933
|
+
# * [Pd, Dash_Punctuation](https://www.compart.com/en/unicode/category/Pd).
|
934
|
+
# * [Pe, Close_Punctuation](https://www.compart.com/en/unicode/category/Pe).
|
935
|
+
# * [Pf, Final_Punctuation](https://www.compart.com/en/unicode/category/Pf).
|
936
|
+
# * [Pi, Initial_Punctuation](https://www.compart.com/en/unicode/category/Pi).
|
937
|
+
# * [Po, Other_Punctuation](https://www.compart.com/en/unicode/category/Po).
|
938
|
+
# * [Ps, Open_Punctuation](https://www.compart.com/en/unicode/category/Ps).
|
606
939
|
#
|
607
|
-
#
|
940
|
+
# * `S`, `Symbol`: `Sc`, `Sk`, `Sm`, or `So`.
|
941
|
+
# * [Sc, Currency_Symbol](https://www.compart.com/en/unicode/category/Sc).
|
942
|
+
# * [Sk, Modifier_Symbol](https://www.compart.com/en/unicode/category/Sk).
|
943
|
+
# * [Sm, Math_Symbol](https://www.compart.com/en/unicode/category/Sm).
|
944
|
+
# * [So, Other_Symbol](https://www.compart.com/en/unicode/category/So).
|
608
945
|
#
|
609
|
-
#
|
610
|
-
#
|
611
|
-
#
|
612
|
-
#
|
946
|
+
# * `Z`, `Separator`: `Zl`, `Zp`, or `Zs`.
|
947
|
+
# * [Zl, Line_Separator](https://www.compart.com/en/unicode/category/Zl).
|
948
|
+
# * [Zp, Paragraph_Separator](https://www.compart.com/en/unicode/category/Zp).
|
949
|
+
# * [Zs, Space_Separator](https://www.compart.com/en/unicode/category/Zs).
|
613
950
|
#
|
614
|
-
#
|
951
|
+
# * `C`, `Other`: `Cc`, `Cf`, `Cn`, `Co`, or `Cs`.
|
952
|
+
# * [Cc, Control](https://www.compart.com/en/unicode/category/Cc).
|
953
|
+
# * [Cf, Format](https://www.compart.com/en/unicode/category/Cf).
|
954
|
+
# * [Cn, Unassigned](https://www.compart.com/en/unicode/category/Cn).
|
955
|
+
# * [Co, Private_Use](https://www.compart.com/en/unicode/category/Co).
|
956
|
+
# * [Cs, Surrogate](https://www.compart.com/en/unicode/category/Cs).
|
615
957
|
#
|
616
|
-
# float_pat = /\A
|
617
|
-
# [[:digit:]]+ # 1 or more digits before the decimal point
|
618
|
-
# (\. # Decimal point
|
619
|
-
# [[:digit:]]+ # 1 or more digits after the decimal point
|
620
|
-
# )? # The decimal point and following digits are optional
|
621
|
-
# \Z/x
|
622
|
-
# float_pat.match('3.14') #=> #<MatchData "3.14" 1:".14">
|
623
958
|
#
|
624
|
-
#
|
959
|
+
# #### Unicode Scripts and Blocks
|
625
960
|
#
|
626
|
-
#
|
627
|
-
# * Use escaped whitespace such as `\ `, i.e. a space preceded by a backslash.
|
628
|
-
# * Use a character class such as `[ ]`.
|
961
|
+
# Among the Unicode properties are:
|
629
962
|
#
|
963
|
+
# * [Unicode scripts](https://en.wikipedia.org/wiki/Script_(Unicode)); see
|
964
|
+
# [supported scripts](https://www.unicode.org/standard/supported.html).
|
965
|
+
# * [Unicode blocks](https://en.wikipedia.org/wiki/Unicode_block); see
|
966
|
+
# [supported blocks](http://www.unicode.org/Public/UNIDATA/Blocks.txt).
|
630
967
|
#
|
631
|
-
# Comments can be included in a non-`x` pattern with the `(?#`*comment*`)`
|
632
|
-
# construct, where *comment* is arbitrary text ignored by the regexp engine.
|
633
968
|
#
|
634
|
-
#
|
969
|
+
# ### POSIX Bracket Expressions
|
635
970
|
#
|
636
|
-
#
|
971
|
+
# A POSIX *bracket expression* is also similar to a character class. These
|
972
|
+
# expressions provide a portable alternative to the above, with the added
|
973
|
+
# benefit of encompassing non-ASCII characters:
|
637
974
|
#
|
638
|
-
#
|
639
|
-
#
|
975
|
+
# * `/\d/` matches only ASCII decimal digits `0` through `9`.
|
976
|
+
# * `/[[:digit:]]/` matches any character in the Unicode `Decimal Number`
|
977
|
+
# (`Nd`) category; see below.
|
640
978
|
#
|
641
|
-
# * `/`*pat*`/u` - UTF-8
|
642
|
-
# * `/`*pat*`/e` - EUC-JP
|
643
|
-
# * `/`*pat*`/s` - Windows-31J
|
644
|
-
# * `/`*pat*`/n` - ASCII-8BIT
|
645
979
|
#
|
980
|
+
# The POSIX bracket expressions:
|
646
981
|
#
|
647
|
-
#
|
648
|
-
#
|
649
|
-
# ASCII-compatible.
|
982
|
+
# * `/[[:digit:]]/`: Matches a [Unicode
|
983
|
+
# digit](https://www.compart.com/en/unicode/category/Nd):
|
650
984
|
#
|
651
|
-
#
|
652
|
-
#
|
985
|
+
# /[[:digit:]]/.match('9') # => #<MatchData "9">
|
986
|
+
# /[[:digit:]]/.match("\u{1fbf9}") # => #<MatchData "🯹">
|
987
|
+
#
|
988
|
+
# * `/[[:xdigit:]]/`: Matches a digit allowed in a hexadecimal number;
|
989
|
+
# equivalent to `[0-9a-fA-F]`.
|
990
|
+
#
|
991
|
+
# * `/[[:upper:]]/`: Matches a [Unicode uppercase
|
992
|
+
# letter](https://www.compart.com/en/unicode/category/Lu):
|
993
|
+
#
|
994
|
+
# /[[:upper:]]/.match('A') # => #<MatchData "A">
|
995
|
+
# /[[:upper:]]/.match("\u00c6") # => #<MatchData "Æ">
|
996
|
+
#
|
997
|
+
# * `/[[:lower:]]/`: Matches a [Unicode lowercase
|
998
|
+
# letter](https://www.compart.com/en/unicode/category/Ll):
|
999
|
+
#
|
1000
|
+
# /[[:lower:]]/.match('a') # => #<MatchData "a">
|
1001
|
+
# /[[:lower:]]/.match("\u01fd") # => #<MatchData "ǽ">
|
1002
|
+
#
|
1003
|
+
# * `/[[:alpha:]]/`: Matches `/[[:upper:]]/` or `/[[:lower:]]/`.
|
1004
|
+
#
|
1005
|
+
# * `/[[:alnum:]]/`: Matches `/[[:alpha:]]/` or `/[[:digit:]]/`.
|
1006
|
+
#
|
1007
|
+
# * `/[[:space:]]/`: Matches [Unicode space
|
1008
|
+
# character](https://www.compart.com/en/unicode/category/Zs):
|
1009
|
+
#
|
1010
|
+
# /[[:space:]]/.match(' ') # => #<MatchData " ">
|
1011
|
+
# /[[:space:]]/.match("\u2005") # => #<MatchData " ">
|
1012
|
+
#
|
1013
|
+
# * `/[[:blank:]]/`: Matches `/[[:space:]]/` or tab character:
|
1014
|
+
#
|
1015
|
+
# /[[:blank:]]/.match(' ') # => #<MatchData " ">
|
1016
|
+
# /[[:blank:]]/.match("\u2005") # => #<MatchData " ">
|
1017
|
+
# /[[:blank:]]/.match("\t") # => #<MatchData "\t">
|
1018
|
+
#
|
1019
|
+
# * `/[[:cntrl:]]/`: Matches [Unicode control
|
1020
|
+
# character](https://www.compart.com/en/unicode/category/Cc):
|
1021
|
+
#
|
1022
|
+
# /[[:cntrl:]]/.match("\u0000") # => #<MatchData "\u0000">
|
1023
|
+
# /[[:cntrl:]]/.match("\u009f") # => #<MatchData "\u009F">
|
1024
|
+
#
|
1025
|
+
# * `/[[:graph:]]/`: Matches any character except `/[[:space:]]/` or
|
1026
|
+
# `/[[:cntrl:]]/`.
|
1027
|
+
#
|
1028
|
+
# * `/[[:print:]]/`: Matches `/[[:graph:]]/` or space character.
|
1029
|
+
#
|
1030
|
+
# * `/[[:punct:]]/`: Matches any [Unicode punctuation
|
1031
|
+
# character](https://www.compart.com/en/unicode/category/Po):
|
1032
|
+
#
|
1033
|
+
#
|
1034
|
+
# Ruby also supports these (non-POSIX) bracket expressions:
|
1035
|
+
#
|
1036
|
+
# * `/[[:ascii:]]/`: Matches a character in the ASCII character set.
|
1037
|
+
# * `/[[:word:]]/`: Matches a character in one of these Unicode character
|
1038
|
+
# categories or having one of these Unicode properties:
|
1039
|
+
#
|
1040
|
+
# * Unicode categories:
|
1041
|
+
# * `Mark` (`M`).
|
1042
|
+
# * `Decimal Number` (`Nd`)
|
1043
|
+
# * `Connector Punctuation` (`Pc`).
|
1044
|
+
#
|
1045
|
+
#
|
1046
|
+
# * Unicode properties:
|
1047
|
+
# * `Alpha`
|
1048
|
+
# * `Join_Control`
|
653
1049
|
#
|
654
|
-
# The `Regexp#fixed_encoding?` predicate indicates whether the regexp has a
|
655
|
-
# *fixed* encoding, that is one incompatible with ASCII. A regexp's encoding can
|
656
|
-
# be explicitly fixed by supplying `Regexp::FIXEDENCODING` as the second
|
657
|
-
# argument of `Regexp.new`:
|
658
1050
|
#
|
659
|
-
# r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING)
|
660
|
-
# r =~ "a\u3042"
|
661
|
-
# # raises Encoding::CompatibilityError: incompatible encoding regexp match
|
662
|
-
# # (ISO-8859-1 regexp with UTF-8 string)
|
663
1051
|
#
|
664
|
-
# ## Regexp Global Variables
|
665
1052
|
#
|
666
|
-
#
|
1053
|
+
# ### Comments
|
667
1054
|
#
|
668
|
-
#
|
669
|
-
# *
|
670
|
-
#
|
671
|
-
# * `$'` contains string after match;
|
672
|
-
# * `$1`, `$2` and so on contain text matching first, second, etc capture
|
673
|
-
# group;
|
674
|
-
# * `$+` contains last capture group.
|
1055
|
+
# A comment may be included in a regexp pattern using the `(?#`*comment*`)`
|
1056
|
+
# construct, where *comment* is a substring of arbitrary text that is to be
|
1057
|
+
# ignored by the regexp engine:
|
675
1058
|
#
|
1059
|
+
# /foo(?#Ignore me)bar/.match('foobar') # => #<MatchData "foobar">
|
1060
|
+
#
|
1061
|
+
# The comment may not include an unescaped terminator character.
|
1062
|
+
#
|
1063
|
+
# See also [Extended Mode](rdoc-ref:Regexp@Extended+Mode).
|
1064
|
+
#
|
1065
|
+
# ## Modes
|
1066
|
+
#
|
1067
|
+
# Each of these modifiers sets a mode for the regexp:
|
1068
|
+
#
|
1069
|
+
# * `i`: `/*pattern*/i` sets [Case-Insensitive
|
1070
|
+
# Mode](rdoc-ref:Regexp@Case-Insensitive+Mode).
|
1071
|
+
# * `m`: `/*pattern*/m` sets [Multiline Mode](rdoc-ref:Regexp@Multiline+Mode).
|
1072
|
+
# * `x`: `/*pattern*/x` sets [Extended Mode](rdoc-ref:Regexp@Extended+Mode).
|
1073
|
+
# * `o`: `/*pattern*/o` sets [Interpolation
|
1074
|
+
# Mode](rdoc-ref:Regexp@Interpolation+Mode).
|
1075
|
+
#
|
1076
|
+
#
|
1077
|
+
# Any, all, or none of these may be applied.
|
1078
|
+
#
|
1079
|
+
# Modifiers `i`, `m`, and `x` may be applied to subexpressions:
|
1080
|
+
#
|
1081
|
+
# * `(?*modifier*)` turns the mode "on" for ensuing subexpressions
|
1082
|
+
# * `(?-*modifier*)` turns the mode "off" for ensuing subexpressions
|
1083
|
+
# * `(?*modifier*:*subexp*)` turns the mode "on" for *subexp* within the group
|
1084
|
+
# * `(?-*modifier*:*subexp*)` turns the mode "off" for *subexp* within the
|
1085
|
+
# group
|
1086
|
+
#
|
1087
|
+
#
|
1088
|
+
# Example:
|
1089
|
+
#
|
1090
|
+
# re = /(?i)te(?-i)st/
|
1091
|
+
# re.match('test') # => #<MatchData "test">
|
1092
|
+
# re.match('TEst') # => #<MatchData "TEst">
|
1093
|
+
# re.match('TEST') # => nil
|
1094
|
+
# re.match('teST') # => nil
|
1095
|
+
#
|
1096
|
+
# re = /t(?i:e)st/
|
1097
|
+
# re.match('test') # => #<MatchData "test">
|
1098
|
+
# re.match('tEst') # => #<MatchData "tEst">
|
1099
|
+
# re.match('tEST') # => nil
|
1100
|
+
#
|
1101
|
+
# Method Regexp#options returns an integer whose value shows the settings for
|
1102
|
+
# case-insensitivity mode, multiline mode, and extended mode.
|
1103
|
+
#
|
1104
|
+
# ### Case-Insensitive Mode
|
1105
|
+
#
|
1106
|
+
# By default, a regexp is case-sensitive:
|
1107
|
+
#
|
1108
|
+
# /foo/.match('FOO') # => nil
|
1109
|
+
#
|
1110
|
+
# Modifier `i` enables case-insensitive mode:
|
1111
|
+
#
|
1112
|
+
# /foo/i.match('FOO')
|
1113
|
+
# # => #<MatchData "FOO">
|
1114
|
+
#
|
1115
|
+
# Method Regexp#casefold? returns whether the mode is case-insensitive.
|
1116
|
+
#
|
1117
|
+
# ### Multiline Mode
|
1118
|
+
#
|
1119
|
+
# The multiline-mode in Ruby is what is commonly called a "dot-all mode":
|
1120
|
+
#
|
1121
|
+
# * Without the `m` modifier, the subexpression `.` does not match newlines:
|
1122
|
+
#
|
1123
|
+
# /a.c/.match("a\nc") # => nil
|
1124
|
+
#
|
1125
|
+
# * With the modifier, it does match:
|
1126
|
+
#
|
1127
|
+
# /a.c/m.match("a\nc") # => #<MatchData "a\nc">
|
1128
|
+
#
|
1129
|
+
#
|
1130
|
+
# Unlike other languages, the modifier `m` does not affect the anchors `^` and
|
1131
|
+
# `$`. These anchors always match at line-boundaries in Ruby.
|
1132
|
+
#
|
1133
|
+
# ### Extended Mode
|
1134
|
+
#
|
1135
|
+
# Modifier `x` enables extended mode, which means that:
|
1136
|
+
#
|
1137
|
+
# * Literal white space in the pattern is to be ignored.
|
1138
|
+
# * Character `#` marks the remainder of its containing line as a comment,
|
1139
|
+
# which is also to be ignored for matching purposes.
|
1140
|
+
#
|
1141
|
+
#
|
1142
|
+
# In extended mode, whitespace and comments may be used to form a
|
1143
|
+
# self-documented regexp.
|
1144
|
+
#
|
1145
|
+
# Regexp not in extended mode (matches some Roman numerals):
|
1146
|
+
#
|
1147
|
+
# pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
|
1148
|
+
# re = /#{pattern}/
|
1149
|
+
# re.match('MCMXLIII') # => #<MatchData "MCMXLIII" 1:"CM" 2:"XL" 3:"III">
|
1150
|
+
#
|
1151
|
+
# Regexp in extended mode:
|
1152
|
+
#
|
1153
|
+
# pattern = <<-EOT
|
1154
|
+
# ^ # beginning of string
|
1155
|
+
# M{0,3} # thousands - 0 to 3 Ms
|
1156
|
+
# (CM|CD|D?C{0,3}) # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 Cs),
|
1157
|
+
# # or 500-800 (D, followed by 0 to 3 Cs)
|
1158
|
+
# (XC|XL|L?X{0,3}) # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 Xs),
|
1159
|
+
# # or 50-80 (L, followed by 0 to 3 Xs)
|
1160
|
+
# (IX|IV|V?I{0,3}) # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 Is),
|
1161
|
+
# # or 5-8 (V, followed by 0 to 3 Is)
|
1162
|
+
# $ # end of string
|
1163
|
+
# EOT
|
1164
|
+
# re = /#{pattern}/x
|
1165
|
+
# re.match('MCMXLIII') # => #<MatchData "MCMXLIII" 1:"CM" 2:"XL" 3:"III">
|
1166
|
+
#
|
1167
|
+
# ### Interpolation Mode
|
1168
|
+
#
|
1169
|
+
# Modifier `o` means that the first time a literal regexp with interpolations is
|
1170
|
+
# encountered, the generated Regexp object is saved and used for all future
|
1171
|
+
# evaluations of that literal regexp. Without modifier `o`, the generated Regexp
|
1172
|
+
# is not saved, so each evaluation of the literal regexp generates a new Regexp
|
1173
|
+
# object.
|
1174
|
+
#
|
1175
|
+
# Without modifier `o`:
|
1176
|
+
#
|
1177
|
+
# def letters; sleep 5; /[A-Z][a-z]/; end
|
1178
|
+
# words = %w[abc def xyz]
|
1179
|
+
# start = Time.now
|
1180
|
+
# words.each {|word| word.match(/\A[#{letters}]+\z/) }
|
1181
|
+
# Time.now - start # => 15.0174892
|
1182
|
+
#
|
1183
|
+
# With modifier `o`:
|
1184
|
+
#
|
1185
|
+
# start = Time.now
|
1186
|
+
# words.each {|word| word.match(/\A[#{letters}]+\z/o) }
|
1187
|
+
# Time.now - start # => 5.0010866
|
1188
|
+
#
|
1189
|
+
# Note that if the literal regexp does not have interpolations, the `o` behavior
|
1190
|
+
# is the default.
|
1191
|
+
#
|
1192
|
+
# ## Encodings
|
1193
|
+
#
|
1194
|
+
# By default, a regexp with only US-ASCII characters has US-ASCII encoding:
|
1195
|
+
#
|
1196
|
+
# re = /foo/
|
1197
|
+
# re.source.encoding # => #<Encoding:US-ASCII>
|
1198
|
+
# re.encoding # => #<Encoding:US-ASCII>
|
1199
|
+
#
|
1200
|
+
# A regular expression containing non-US-ASCII characters is assumed to use the
|
1201
|
+
# source encoding. This can be overridden with one of the following modifiers.
|
1202
|
+
#
|
1203
|
+
# * `/*pat*/n`: US-ASCII if only containing US-ASCII characters, otherwise
|
1204
|
+
# ASCII-8BIT:
|
1205
|
+
#
|
1206
|
+
# /foo/n.encoding # => #<Encoding:US-ASCII>
|
1207
|
+
# /foo\xff/n.encoding # => #<Encoding:ASCII-8BIT>
|
1208
|
+
# /foo\x7f/n.encoding # => #<Encoding:US-ASCII>
|
1209
|
+
#
|
1210
|
+
# * `/*pat*/u`: UTF-8
|
1211
|
+
#
|
1212
|
+
# /foo/u.encoding # => #<Encoding:UTF-8>
|
1213
|
+
#
|
1214
|
+
# * `/*pat*/e`: EUC-JP
|
1215
|
+
#
|
1216
|
+
# /foo/e.encoding # => #<Encoding:EUC-JP>
|
1217
|
+
#
|
1218
|
+
# * `/*pat*/s`: Windows-31J
|
1219
|
+
#
|
1220
|
+
# /foo/s.encoding # => #<Encoding:Windows-31J>
|
1221
|
+
#
|
1222
|
+
#
|
1223
|
+
# A regexp can be matched against a target string when either:
|
1224
|
+
#
|
1225
|
+
# * They have the same encoding.
|
1226
|
+
# * The regexp's encoding is a fixed encoding and the string contains only
|
1227
|
+
# ASCII characters. Method Regexp#fixed_encoding? returns whether the regexp
|
1228
|
+
# has a *fixed* encoding.
|
1229
|
+
#
|
1230
|
+
#
|
1231
|
+
# If a match between incompatible encodings is attempted, an
|
1232
|
+
# `Encoding::CompatibilityError` exception is raised.
|
676
1233
|
#
|
677
1234
|
# Example:
|
678
1235
|
#
|
679
|
-
#
|
680
|
-
#
|
681
|
-
#
|
1236
|
+
# re = eval("# encoding: ISO-8859-1\n/foo\\xff?/")
|
1237
|
+
# re.encoding # => #<Encoding:ISO-8859-1>
|
1238
|
+
# re =~ "foo".encode("UTF-8") # => 0
|
1239
|
+
# re =~ "foo\u0100" # Raises Encoding::CompatibilityError
|
1240
|
+
#
|
1241
|
+
# The encoding may be explicitly fixed by including Regexp::FIXEDENCODING in the
|
1242
|
+
# second argument for Regexp.new:
|
1243
|
+
#
|
1244
|
+
# # Regexp with encoding ISO-8859-1.
|
1245
|
+
# re = Regexp.new("a".force_encoding('iso-8859-1'), Regexp::FIXEDENCODING)
|
1246
|
+
# re.encoding # => #<Encoding:ISO-8859-1>
|
1247
|
+
# # Target string with encoding UTF-8.
|
1248
|
+
# s = "a\u3042"
|
1249
|
+
# s.encoding # => #<Encoding:UTF-8>
|
1250
|
+
# re.match(s) # Raises Encoding::CompatibilityError.
|
682
1251
|
#
|
683
|
-
#
|
684
|
-
# # same as m[0]
|
685
|
-
# $` #=> "hay"
|
686
|
-
# # same as m.pre_match
|
687
|
-
# $' #=> "k"
|
688
|
-
# # same as m.post_match
|
689
|
-
# $1 #=> "ta"
|
690
|
-
# # same as m[1]
|
691
|
-
# $2 #=> "c"
|
692
|
-
# # same as m[2]
|
693
|
-
# $3 #=> nil
|
694
|
-
# # no third group in pattern
|
695
|
-
# $+ #=> "c"
|
696
|
-
# # same as m[-1]
|
1252
|
+
# ## Timeouts
|
697
1253
|
#
|
698
|
-
#
|
1254
|
+
# When either a regexp source or a target string comes from untrusted input,
|
1255
|
+
# malicious values could become a denial-of-service attack; to prevent such an
|
1256
|
+
# attack, it is wise to set a timeout.
|
699
1257
|
#
|
700
|
-
#
|
1258
|
+
# Regexp has two timeout values:
|
701
1259
|
#
|
702
|
-
#
|
703
|
-
#
|
1260
|
+
# * A class default timeout, used for a regexp whose instance timeout is
|
1261
|
+
# `nil`; this default is initially `nil`, and may be set by method
|
1262
|
+
# Regexp.timeout=:
|
704
1263
|
#
|
705
|
-
#
|
1264
|
+
# Regexp.timeout # => nil
|
1265
|
+
# Regexp.timeout = 3.0
|
1266
|
+
# Regexp.timeout # => 3.0
|
706
1267
|
#
|
707
|
-
#
|
708
|
-
# #=> "aaaaaaaaaaaaaaaaaaaaaaaaadaaaac"
|
1268
|
+
# * An instance timeout, which defaults to `nil` and may be set in Regexp.new:
|
709
1269
|
#
|
710
|
-
#
|
1270
|
+
# re = Regexp.new('foo', timeout: 5.0)
|
1271
|
+
# re.timeout # => 5.0
|
711
1272
|
#
|
712
|
-
# /(b|a)/ =~ s #=> 0
|
713
|
-
# /(b|a+)/ =~ s #=> 0
|
714
|
-
# /(b|a+)*/ =~ s #=> 0
|
715
1273
|
#
|
716
|
-
#
|
1274
|
+
# When regexp.timeout is `nil`, the timeout "falls through" to Regexp.timeout;
|
1275
|
+
# when regexp.timeout is non-`nil`, that value controls timing out:
|
717
1276
|
#
|
718
|
-
#
|
1277
|
+
# | regexp.timeout Value | Regexp.timeout Value | Result |
|
1278
|
+
# |----------------------|----------------------|-----------------------------|
|
1279
|
+
# | nil | nil | Never times out. |
|
1280
|
+
# | nil | Float | Times out in Float seconds. |
|
1281
|
+
# | Float | Any | Times out in Float seconds. |
|
719
1282
|
#
|
720
|
-
#
|
721
|
-
# `+` and an enclosing `*` with nothing to differentiate which is in control of
|
722
|
-
# any particular character. The nondeterminism that results produces
|
723
|
-
# super-linear performance. (Consult *Mastering Regular Expressions* (3rd ed.),
|
724
|
-
# pp 222, by *Jeffery Friedl*, for an in-depth analysis). This particular case
|
725
|
-
# can be fixed by use of atomic grouping, which prevents the unnecessary
|
726
|
-
# backtracking:
|
1283
|
+
# ## Optimization
|
727
1284
|
#
|
728
|
-
#
|
729
|
-
#
|
730
|
-
#
|
731
|
-
#
|
1285
|
+
# For certain values of the pattern and target string, matching time can grow
|
1286
|
+
# polynomially or exponentially in relation to the input size; the potential
|
1287
|
+
# vulnerability arising from this is the [regular expression
|
1288
|
+
# denial-of-service](https://en.wikipedia.org/wiki/ReDoS) (ReDoS) attack.
|
732
1289
|
#
|
733
|
-
#
|
734
|
-
#
|
1290
|
+
# Regexp matching can apply an optimization to prevent ReDoS attacks. When the
|
1291
|
+
# optimization is applied, matching time increases linearly (not polynomially or
|
1292
|
+
# exponentially) in relation to the input size, and a ReDoS attack is not
|
1293
|
+
# possible.
|
735
1294
|
#
|
736
|
-
#
|
737
|
-
# mandatory *a*s:
|
1295
|
+
# This optimization is applied if the pattern meets these criteria:
|
738
1296
|
#
|
739
|
-
#
|
1297
|
+
# * No backreferences.
|
1298
|
+
# * No subexpression calls.
|
1299
|
+
# * No nested lookaround anchors or atomic groups.
|
1300
|
+
# * No nested quantifiers with counting (i.e. no nested `{n}`, `{min,}`,
|
1301
|
+
# `{,max}`, or `{min,max}` style quantifiers)
|
740
1302
|
#
|
741
|
-
# The 29 optional *a*s match the string, but this prevents the 29 mandatory *a*s
|
742
|
-
# that follow from matching. Ruby must then backtrack repeatedly so as to
|
743
|
-
# satisfy as many of the optional matches as it can while still matching the
|
744
|
-
# mandatory 29. It is plain to us that none of the optional matches can succeed,
|
745
|
-
# but this fact unfortunately eludes Ruby.
|
746
1303
|
#
|
747
|
-
#
|
748
|
-
#
|
749
|
-
# optional *a*s, a range of optional *a*s can be matched all at once with
|
750
|
-
# *a{0,29}*:
|
1304
|
+
# You can use method Regexp.linear_time? to determine whether a pattern meets
|
1305
|
+
# these criteria:
|
751
1306
|
#
|
752
|
-
# Regexp.
|
1307
|
+
# Regexp.linear_time?(/a*/) # => true
|
1308
|
+
# Regexp.linear_time?('a*') # => true
|
1309
|
+
# Regexp.linear_time?(/(a*)\1/) # => false
|
753
1310
|
#
|
754
|
-
#
|
1311
|
+
# However, an untrusted source may not be safe even if the method returns
|
1312
|
+
# `true`, because the optimization uses memoization (which may incur large
|
1313
|
+
# memory consumption).
|
755
1314
|
#
|
756
|
-
#
|
757
|
-
# process-global configuration of timeout for Regexp matching.
|
1315
|
+
# ## References
|
758
1316
|
#
|
759
|
-
#
|
760
|
-
# s = 'a' * 25 + 'd' + 'a' * 4 + 'c'
|
761
|
-
# /(b|a+)*c/ =~ s #=> This raises an exception in three seconds
|
1317
|
+
# Read (online PDF books):
|
762
1318
|
#
|
763
|
-
#
|
1319
|
+
# * [Mastering Regular
|
1320
|
+
# Expressions](https://ia902508.us.archive.org/10/items/allitebooks-02/Maste
|
1321
|
+
# ring%20Regular%20Expressions%2C%203rd%20Edition.pdf) by Jeffrey E.F.
|
1322
|
+
# Friedl.
|
1323
|
+
# * [Regular Expressions
|
1324
|
+
# Cookbook](https://doc.lagout.org/programmation/Regular%20Expressions/Regul
|
1325
|
+
# ar%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Program
|
1326
|
+
# ming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202012-
|
1327
|
+
# 09-06%5D.pdf) by Jan Goyvaerts & Steven Levithan.
|
764
1328
|
#
|
765
|
-
# re = Regexp.new("(b|a+)*c", timeout: 3)
|
766
|
-
# s = 'a' * 25 + 'd' + 'a' * 4 + 'c'
|
767
|
-
# /(b|a+)*c/ =~ s #=> This raises an exception in three seconds
|
768
1329
|
#
|
769
|
-
#
|
770
|
-
#
|
771
|
-
#
|
772
|
-
# timeout is not set by default because an appropriate limit highly depends on
|
773
|
-
# an application requirement and context.
|
1330
|
+
# Explore, test (interactive online editor):
|
1331
|
+
#
|
1332
|
+
# * [Rubular](https://rubular.com/).
|
774
1333
|
#
|
775
1334
|
class Regexp
|
776
1335
|
# <!--
|
@@ -792,7 +1351,7 @@ class Regexp
|
|
792
1351
|
# Regexp.new('foo', 'i') # => /foo/i
|
793
1352
|
# Regexp.new('foo', 'im') # => /foo/im
|
794
1353
|
#
|
795
|
-
# * The
|
1354
|
+
# * The bit-wise OR of one or more of the constants Regexp::EXTENDED,
|
796
1355
|
# Regexp::IGNORECASE, Regexp::MULTILINE, and Regexp::NOENCODING:
|
797
1356
|
#
|
798
1357
|
# Regexp.new('foo', Regexp::IGNORECASE) # => /foo/i
|
@@ -803,6 +1362,7 @@ class Regexp
|
|
803
1362
|
# Regexp.new('foo', flags) # => /foo/mix
|
804
1363
|
#
|
805
1364
|
# * `nil` or `false`, which is ignored.
|
1365
|
+
# * Any other truthy value, in which case the regexp will be case-insensitive.
|
806
1366
|
#
|
807
1367
|
#
|
808
1368
|
# If optional keyword argument `timeout` is given, its float value overrides the
|
@@ -820,8 +1380,6 @@ class Regexp
|
|
820
1380
|
# r3 = Regexp.new(r, timeout: 3.14) # => /foo/m
|
821
1381
|
# r3.timeout # => 3.14
|
822
1382
|
#
|
823
|
-
# Regexp.compile is an alias for Regexp.new.
|
824
|
-
#
|
825
1383
|
def initialize: (String string, ?String | Integer | nil | false options, ?timeout: Float?) -> Object
|
826
1384
|
| (Regexp regexp, ?timeout: Float?) -> void
|
827
1385
|
|
@@ -847,8 +1405,6 @@ class Regexp
|
|
847
1405
|
# r = Regexp.new(Regexp.escape(s)) # => /\\\\\\\*\\\?\\\{\\\}\\\./
|
848
1406
|
# r.match(s) # => #<MatchData "\\\\\\*\\?\\{\\}\\.">
|
849
1407
|
#
|
850
|
-
# Regexp.quote is an alias for Regexp.escape.
|
851
|
-
#
|
852
1408
|
def self.escape: (interned str) -> String
|
853
1409
|
|
854
1410
|
# <!--
|
@@ -858,8 +1414,8 @@ class Regexp
|
|
858
1414
|
# - Regexp.last_match(name) -> string or nil
|
859
1415
|
# -->
|
860
1416
|
# With no argument, returns the value of `$~`, which is the result of the most
|
861
|
-
# recent pattern match (see [Regexp
|
862
|
-
#
|
1417
|
+
# recent pattern match (see [Regexp global
|
1418
|
+
# variables](rdoc-ref:Regexp@Global+Variables)):
|
863
1419
|
#
|
864
1420
|
# /c(.)t/ =~ 'cat' # => 0
|
865
1421
|
# Regexp.last_match # => #<MatchData "cat" 1:"a">
|
@@ -926,8 +1482,6 @@ class Regexp
|
|
926
1482
|
# r = Regexp.new(Regexp.escape(s)) # => /\\\\\\\*\\\?\\\{\\\}\\\./
|
927
1483
|
# r.match(s) # => #<MatchData "\\\\\\*\\?\\{\\}\\.">
|
928
1484
|
#
|
929
|
-
# Regexp.quote is an alias for Regexp.escape.
|
930
|
-
#
|
931
1485
|
def self.quote: (interned str) -> String
|
932
1486
|
|
933
1487
|
# <!--
|
@@ -1019,8 +1573,6 @@ class Regexp
|
|
1019
1573
|
# /foo/ == Regexp.new('food') # => false
|
1020
1574
|
# /foo/ == Regexp.new("abc".force_encoding("euc-jp")) # => false
|
1021
1575
|
#
|
1022
|
-
# Regexp#eql? is an alias for Regexp#==.
|
1023
|
-
#
|
1024
1576
|
def ==: (untyped other) -> bool
|
1025
1577
|
|
1026
1578
|
# <!--
|
@@ -1048,8 +1600,8 @@ class Regexp
|
|
1048
1600
|
# - regexp =~ string -> integer or nil
|
1049
1601
|
# -->
|
1050
1602
|
# Returns the integer index (in characters) of the first match for `self` and
|
1051
|
-
# `string`, or `nil` if none; also sets the [rdoc-ref:Regexp
|
1052
|
-
#
|
1603
|
+
# `string`, or `nil` if none; also sets the [Regexp global
|
1604
|
+
# variables](rdoc-ref:Regexp@Global+Variables):
|
1053
1605
|
#
|
1054
1606
|
# /at/ =~ 'input data' # => 7
|
1055
1607
|
# $~ # => #<MatchData "at">
|
@@ -1062,7 +1614,7 @@ class Regexp
|
|
1062
1614
|
# * Is a regexp literal; see [Regexp
|
1063
1615
|
# Literals](rdoc-ref:literals.rdoc@Regexp+Literals).
|
1064
1616
|
# * Does not contain interpolations; see [Regexp
|
1065
|
-
#
|
1617
|
+
# interpolation](rdoc-ref:Regexp@Interpolation+Mode).
|
1066
1618
|
# * Is at the left of the expression.
|
1067
1619
|
#
|
1068
1620
|
#
|
@@ -1131,8 +1683,6 @@ class Regexp
|
|
1131
1683
|
# /foo/ == Regexp.new('food') # => false
|
1132
1684
|
# /foo/ == Regexp.new("abc".force_encoding("euc-jp")) # => false
|
1133
1685
|
#
|
1134
|
-
# Regexp#eql? is an alias for Regexp#==.
|
1135
|
-
#
|
1136
1686
|
def eql?: (untyped other) -> bool
|
1137
1687
|
|
1138
1688
|
# <!--
|
@@ -1296,8 +1846,8 @@ class Regexp
|
|
1296
1846
|
# /foo/mix.options # => 7
|
1297
1847
|
#
|
1298
1848
|
# Note that additional bits may be set in the returned integer; these are
|
1299
|
-
# maintained internally
|
1300
|
-
#
|
1849
|
+
# maintained internally in `self`, are ignored if passed to Regexp.new, and may
|
1850
|
+
# be ignored by the caller:
|
1301
1851
|
#
|
1302
1852
|
# Returns the set of bits corresponding to the options used when creating this
|
1303
1853
|
# regexp (see Regexp::new for details). Note that additional bits may be set in
|
@@ -1339,7 +1889,8 @@ class Regexp
|
|
1339
1889
|
# s0 = r0.to_s # => "(?ix-m:ab+c)"
|
1340
1890
|
#
|
1341
1891
|
# The returned string may be used as an argument to Regexp.new, or as
|
1342
|
-
# interpolated text for a [Regexp
|
1892
|
+
# interpolated text for a [Regexp
|
1893
|
+
# interpolation](rdoc-ref:Regexp@Interpolation+Mode):
|
1343
1894
|
#
|
1344
1895
|
# r1 = Regexp.new(s0) # => /(?ix-m:ab+c)/
|
1345
1896
|
# r2 = /#{s0}/ # => /(?ix-m:ab+c)/
|