rbs 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/ruby.yml +28 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +15 -0
- data/BSDL +22 -0
- data/CHANGELOG.md +9 -0
- data/COPYING +56 -0
- data/Gemfile +6 -0
- data/README.md +93 -0
- data/Rakefile +142 -0
- data/bin/annotate-with-rdoc +157 -0
- data/bin/console +14 -0
- data/bin/query-rdoc +103 -0
- data/bin/setup +10 -0
- data/bin/sort +89 -0
- data/bin/test_runner.rb +16 -0
- data/docs/CONTRIBUTING.md +97 -0
- data/docs/sigs.md +148 -0
- data/docs/stdlib.md +152 -0
- data/docs/syntax.md +528 -0
- data/exe/rbs +7 -0
- data/lib/rbs.rb +64 -0
- data/lib/rbs/ast/annotation.rb +27 -0
- data/lib/rbs/ast/comment.rb +27 -0
- data/lib/rbs/ast/declarations.rb +395 -0
- data/lib/rbs/ast/members.rb +362 -0
- data/lib/rbs/buffer.rb +50 -0
- data/lib/rbs/builtin_names.rb +55 -0
- data/lib/rbs/cli.rb +558 -0
- data/lib/rbs/constant.rb +26 -0
- data/lib/rbs/constant_table.rb +150 -0
- data/lib/rbs/definition.rb +170 -0
- data/lib/rbs/definition_builder.rb +919 -0
- data/lib/rbs/environment.rb +281 -0
- data/lib/rbs/environment_loader.rb +136 -0
- data/lib/rbs/environment_walker.rb +124 -0
- data/lib/rbs/errors.rb +187 -0
- data/lib/rbs/location.rb +102 -0
- data/lib/rbs/method_type.rb +123 -0
- data/lib/rbs/namespace.rb +91 -0
- data/lib/rbs/parser.y +1344 -0
- data/lib/rbs/prototype/rb.rb +553 -0
- data/lib/rbs/prototype/rbi.rb +587 -0
- data/lib/rbs/prototype/runtime.rb +381 -0
- data/lib/rbs/substitution.rb +46 -0
- data/lib/rbs/test.rb +26 -0
- data/lib/rbs/test/errors.rb +61 -0
- data/lib/rbs/test/hook.rb +294 -0
- data/lib/rbs/test/setup.rb +58 -0
- data/lib/rbs/test/spy.rb +325 -0
- data/lib/rbs/test/test_helper.rb +183 -0
- data/lib/rbs/test/type_check.rb +254 -0
- data/lib/rbs/type_name.rb +70 -0
- data/lib/rbs/types.rb +936 -0
- data/lib/rbs/variance_calculator.rb +138 -0
- data/lib/rbs/vendorer.rb +47 -0
- data/lib/rbs/version.rb +3 -0
- data/lib/rbs/writer.rb +269 -0
- data/lib/ruby/signature.rb +7 -0
- data/rbs.gemspec +46 -0
- data/stdlib/abbrev/abbrev.rbs +60 -0
- data/stdlib/base64/base64.rbs +71 -0
- data/stdlib/benchmark/benchmark.rbs +372 -0
- data/stdlib/builtin/array.rbs +1997 -0
- data/stdlib/builtin/basic_object.rbs +280 -0
- data/stdlib/builtin/binding.rbs +177 -0
- data/stdlib/builtin/builtin.rbs +45 -0
- data/stdlib/builtin/class.rbs +145 -0
- data/stdlib/builtin/comparable.rbs +116 -0
- data/stdlib/builtin/complex.rbs +400 -0
- data/stdlib/builtin/constants.rbs +37 -0
- data/stdlib/builtin/data.rbs +5 -0
- data/stdlib/builtin/deprecated.rbs +2 -0
- data/stdlib/builtin/dir.rbs +413 -0
- data/stdlib/builtin/encoding.rbs +607 -0
- data/stdlib/builtin/enumerable.rbs +404 -0
- data/stdlib/builtin/enumerator.rbs +260 -0
- data/stdlib/builtin/errno.rbs +781 -0
- data/stdlib/builtin/errors.rbs +582 -0
- data/stdlib/builtin/exception.rbs +194 -0
- data/stdlib/builtin/false_class.rbs +40 -0
- data/stdlib/builtin/fiber.rbs +68 -0
- data/stdlib/builtin/fiber_error.rbs +12 -0
- data/stdlib/builtin/file.rbs +1076 -0
- data/stdlib/builtin/file_test.rbs +59 -0
- data/stdlib/builtin/float.rbs +696 -0
- data/stdlib/builtin/gc.rbs +243 -0
- data/stdlib/builtin/hash.rbs +1029 -0
- data/stdlib/builtin/integer.rbs +707 -0
- data/stdlib/builtin/io.rbs +683 -0
- data/stdlib/builtin/kernel.rbs +576 -0
- data/stdlib/builtin/marshal.rbs +161 -0
- data/stdlib/builtin/match_data.rbs +271 -0
- data/stdlib/builtin/math.rbs +369 -0
- data/stdlib/builtin/method.rbs +185 -0
- data/stdlib/builtin/module.rbs +1104 -0
- data/stdlib/builtin/nil_class.rbs +82 -0
- data/stdlib/builtin/numeric.rbs +409 -0
- data/stdlib/builtin/object.rbs +824 -0
- data/stdlib/builtin/proc.rbs +429 -0
- data/stdlib/builtin/process.rbs +1227 -0
- data/stdlib/builtin/random.rbs +267 -0
- data/stdlib/builtin/range.rbs +226 -0
- data/stdlib/builtin/rational.rbs +424 -0
- data/stdlib/builtin/rb_config.rbs +57 -0
- data/stdlib/builtin/regexp.rbs +1083 -0
- data/stdlib/builtin/ruby_vm.rbs +14 -0
- data/stdlib/builtin/signal.rbs +55 -0
- data/stdlib/builtin/string.rbs +1901 -0
- data/stdlib/builtin/string_io.rbs +284 -0
- data/stdlib/builtin/struct.rbs +40 -0
- data/stdlib/builtin/symbol.rbs +228 -0
- data/stdlib/builtin/thread.rbs +1108 -0
- data/stdlib/builtin/thread_group.rbs +23 -0
- data/stdlib/builtin/time.rbs +1047 -0
- data/stdlib/builtin/trace_point.rbs +290 -0
- data/stdlib/builtin/true_class.rbs +46 -0
- data/stdlib/builtin/unbound_method.rbs +153 -0
- data/stdlib/builtin/warning.rbs +17 -0
- data/stdlib/coverage/coverage.rbs +62 -0
- data/stdlib/csv/csv.rbs +773 -0
- data/stdlib/erb/erb.rbs +392 -0
- data/stdlib/find/find.rbs +40 -0
- data/stdlib/ipaddr/ipaddr.rbs +247 -0
- data/stdlib/json/json.rbs +335 -0
- data/stdlib/pathname/pathname.rbs +1093 -0
- data/stdlib/prime/integer-extension.rbs +23 -0
- data/stdlib/prime/prime.rbs +188 -0
- data/stdlib/securerandom/securerandom.rbs +9 -0
- data/stdlib/set/set.rbs +301 -0
- data/stdlib/tmpdir/tmpdir.rbs +53 -0
- metadata +292 -0
@@ -0,0 +1,1083 @@
|
|
1
|
+
# A Regexp holds a regular expression, used to match a pattern against strings.
|
2
|
+
# Regexps are created using the `/.../` and `%r{...}` literals, and by the
|
3
|
+
# Regexp::new constructor.
|
4
|
+
#
|
5
|
+
# Regular expressions (*regexp*s) are patterns which describe the contents of a
|
6
|
+
# string. They're used for testing whether a string contains a given pattern, or
|
7
|
+
# extracting the portions that match. They are created with the `/`*pat*`/` and
|
8
|
+
# `%r{`*pat*`}` literals or the `Regexp.new` constructor.
|
9
|
+
#
|
10
|
+
# A regexp is usually delimited with forward slashes (`/`). For example:
|
11
|
+
#
|
12
|
+
# /hay/ =~ 'haystack' #=> 0
|
13
|
+
# /y/.match('haystack') #=> #<MatchData "y">
|
14
|
+
#
|
15
|
+
# If a string contains the pattern it is said to *match*. A literal string
|
16
|
+
# matches itself.
|
17
|
+
#
|
18
|
+
# Here 'haystack' does not contain the pattern 'needle', so it doesn't match:
|
19
|
+
#
|
20
|
+
# /needle/.match('haystack') #=> nil
|
21
|
+
#
|
22
|
+
# Here 'haystack' contains the pattern 'hay', so it matches:
|
23
|
+
#
|
24
|
+
# /hay/.match('haystack') #=> #<MatchData "hay">
|
25
|
+
#
|
26
|
+
# Specifically, `/st/` requires that the string contains the letter *s* followed
|
27
|
+
# by the letter *t*, so it matches *haystack*, also.
|
28
|
+
#
|
29
|
+
# ## `=~` and Regexp#match
|
30
|
+
#
|
31
|
+
# Pattern matching may be achieved by using the `=~` operator or the Regexp#match
|
32
|
+
# method.
|
33
|
+
#
|
34
|
+
# ### `=~` operator
|
35
|
+
#
|
36
|
+
# `=~` is Ruby's basic pattern-matching operator. When one operand is a regular
|
37
|
+
# expression and the other is a string then the regular expression is used as a
|
38
|
+
# pattern to match against the string. (This operator is equivalently defined
|
39
|
+
# by Regexp and String so the order of String and Regexp does not matter. Other
|
40
|
+
# classes may have different implementations of `=~`.) If a match is found, the
|
41
|
+
# operator returns the index of the first match in the string, otherwise it returns `nil`.
|
42
|
+
#
|
43
|
+
# /hay/ =~ 'haystack' #=> 0
|
44
|
+
# 'haystack' =~ /hay/ #=> 0
|
45
|
+
# /a/ =~ 'haystack' #=> 1
|
46
|
+
# /u/ =~ 'haystack' #=> nil
|
47
|
+
#
|
48
|
+
# When the `=~` operator is used with a String and a Regexp, the `$~` global variable is set
|
49
|
+
# after a successful match. `$~` holds a MatchData object. Regexp.last_match is
|
50
|
+
# equivalent to `$~`.
|
51
|
+
#
|
52
|
+
# ### Regexp#match method
|
53
|
+
#
|
54
|
+
# The #match method returns a MatchData object:
|
55
|
+
#
|
56
|
+
# /st/.match('haystack') #=> #<MatchData "st">
|
57
|
+
#
|
58
|
+
# ## Metacharacters and Escapes
|
59
|
+
#
|
60
|
+
# The following are *metacharacters* `(`, `)`, `[`, `]`, `{`, `}`, `.`, `?`,
|
61
|
+
# `+`, `*`. They have a specific meaning when appearing in a pattern. To match
|
62
|
+
# them literally they must be backslash-escaped. To match a backslash literally,
|
63
|
+
# backslash-escape it: `\\`.
|
64
|
+
#
|
65
|
+
# /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> #<MatchData "1 + 2 = 3?">
|
66
|
+
# /a\\b/.match('a\\b') #=> #<MatchData "a\\b">
|
67
|
+
#
|
68
|
+
# Patterns behave like double-quoted strings and can contain the same backslash
|
69
|
+
# escapes (the meaning of `\s` is different, however, see
|
70
|
+
# [below](#label-Character+Classes)).
|
71
|
+
#
|
72
|
+
# /\s\u{6771 4eac 90fd}/.match("Go to 東京都")
|
73
|
+
# #=> #<MatchData " 東京都">
|
74
|
+
#
|
75
|
+
# Arbitrary Ruby expressions can be embedded into patterns with the `#{...}`
|
76
|
+
# construct.
|
77
|
+
#
|
78
|
+
# place = "東京都"
|
79
|
+
# /#{place}/.match("Go to 東京都")
|
80
|
+
# #=> #<MatchData "東京都">
|
81
|
+
#
|
82
|
+
# ## Character Classes
|
83
|
+
#
|
84
|
+
# A *character class* is delimited with square brackets (`[`, `]`) and lists
|
85
|
+
# characters that may appear at that point in the match. `/[ab]/` means *a* or
|
86
|
+
# *b*, as opposed to `/ab/` which means *a* followed by *b*.
|
87
|
+
#
|
88
|
+
# /W[aeiou]rd/.match("Word") #=> #<MatchData "Word">
|
89
|
+
#
|
90
|
+
# Within a character class the hyphen (`-`) is a metacharacter denoting an
|
91
|
+
# inclusive range of characters. `[abcd]` is equivalent to `[a-d]`. A range can
|
92
|
+
# be followed by another range, so `[abcdwxyz]` is equivalent to `[a-dw-z]`. The
|
93
|
+
# order in which ranges or individual characters appear inside a character class
|
94
|
+
# is irrelevant.
|
95
|
+
#
|
96
|
+
# /[0-9a-f]/.match('9f') #=> #<MatchData "9">
|
97
|
+
# /[9f]/.match('9f') #=> #<MatchData "9">
|
98
|
+
#
|
99
|
+
# If the first character of a character class is a caret (`^`) the class is
|
100
|
+
# inverted: it matches any character *except* those named.
|
101
|
+
#
|
102
|
+
# /[^a-eg-z]/.match('f') #=> #<MatchData "f">
|
103
|
+
#
|
104
|
+
# A character class may contain another character class. By itself this isn't
|
105
|
+
# useful because `[a-z[0-9]]` describes the same set as `[a-z0-9]`. However,
|
106
|
+
# character classes also support the `&&` operator which performs set
|
107
|
+
# intersection on its arguments. The two can be combined as follows:
|
108
|
+
#
|
109
|
+
# /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z))
|
110
|
+
#
|
111
|
+
# This is equivalent to:
|
112
|
+
#
|
113
|
+
# /[abh-w]/
|
114
|
+
#
|
115
|
+
# The following metacharacters also behave like character classes:
|
116
|
+
#
|
117
|
+
# * `/./` - Any character except a newline.
|
118
|
+
# * `/./m` - Any character (the `m` modifier enables multiline mode)
|
119
|
+
# * `/\w/` - A word character (`[a-zA-Z0-9_]`)
|
120
|
+
# * `/\W/` - A non-word character (`[^a-zA-Z0-9_]`). Please take a look at
|
121
|
+
# [Bug #4044](https://bugs.ruby-lang.org/issues/4044) if using `/\W/` with
|
122
|
+
# the `/i` modifier.
|
123
|
+
# * `/\d/` - A digit character (`[0-9]`)
|
124
|
+
# * `/\D/` - A non-digit character (`[^0-9]`)
|
125
|
+
# * `/\h/` - A hexdigit character (`[0-9a-fA-F]`)
|
126
|
+
# * `/\H/` - A non-hexdigit character (`[^0-9a-fA-F]`)
|
127
|
+
# * `/\s/` - A whitespace character: `/[ \t\r\n\f\v]/`
|
128
|
+
# * `/\S/` - A non-whitespace character: `/[^ \t\r\n\f\v]/`
|
129
|
+
# * `/\R/` - A linebreak: `\n`, `\v`, `\f`, `\r`, `\u0085` (NEXT LINE),
|
130
|
+
# `\u2028` (LINE SEPARATOR), `\u2029` (PARAGRAPH SEPARATOR) or `\r\n`.
|
131
|
+
#
|
132
|
+
#
|
133
|
+
# POSIX *bracket expressions* are also similar to character classes. They
|
134
|
+
# provide a portable alternative to the above, with the added benefit that they
|
135
|
+
# encompass non-ASCII characters. For instance, `/\d/` matches only the ASCII
|
136
|
+
# decimal digits (0-9); whereas `/[[:digit:]]/` matches any character in the
|
137
|
+
# Unicode *Nd* category.
|
138
|
+
#
|
139
|
+
# * `/[[:alnum:]]/` - Alphabetic and numeric character
|
140
|
+
# * `/[[:alpha:]]/` - Alphabetic character
|
141
|
+
# * `/[[:blank:]]/` - Space or tab
|
142
|
+
# * `/[[:cntrl:]]/` - Control character
|
143
|
+
# * `/[[:digit:]]/` - Digit
|
144
|
+
# * `/[[:graph:]]/` - Non-blank character (excludes spaces, control
|
145
|
+
# characters, and similar)
|
146
|
+
# * `/[[:lower:]]/` - Lowercase alphabetical character
|
147
|
+
# * `/[[:print:]]/` - Like [:graph:], but includes the space character
|
148
|
+
# * `/[[:punct:]]/` - Punctuation character
|
149
|
+
# * `/[[:space:]]/` - Whitespace character (`[:blank:]`, newline, carriage
|
150
|
+
# return, etc.)
|
151
|
+
# * `/[[:upper:]]/` - Uppercase alphabetical
|
152
|
+
# * `/[[:xdigit:]]/` - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
|
153
|
+
#
|
154
|
+
#
|
155
|
+
# Ruby also supports the following non-POSIX character classes:
|
156
|
+
#
|
157
|
+
# * `/[[:word:]]/` - A character in one of the following Unicode general
|
158
|
+
# categories *Letter*, *Mark*, *Number*, *Connector_Punctuation*
|
159
|
+
# * `/[[:ascii:]]/` - A character in the ASCII character set
|
160
|
+
#
|
161
|
+
# # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO"
|
162
|
+
# /[[:digit:]]/.match("\u06F2") #=> #<MatchData "\u{06F2}">
|
163
|
+
# /[[:upper:]][[:lower:]]/.match("Hello") #=> #<MatchData "He">
|
164
|
+
# /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> #<MatchData "A6">
|
165
|
+
#
|
166
|
+
#
|
167
|
+
# ## Repetition
|
168
|
+
#
|
169
|
+
# The constructs described so far match a single character. They can be followed
|
170
|
+
# by a repetition metacharacter to specify how many times they need to occur.
|
171
|
+
# Such metacharacters are called *quantifiers*.
|
172
|
+
#
|
173
|
+
# * `*` - Zero or more times
|
174
|
+
# * `+` - One or more times
|
175
|
+
# * `?` - Zero or one times (optional)
|
176
|
+
# * `{`*n*`}` - Exactly *n* times
|
177
|
+
# * `{`*n*`,}` - *n* or more times
|
178
|
+
# * `{,`*m*`}` - *m* or fewer times
|
179
|
+
# * `{`*n*`,`*m*`}` - At least *n* and at most *m* times
|
180
|
+
#
|
181
|
+
#
|
182
|
+
# At least one uppercase character ('H'), at least one lowercase character
|
183
|
+
# ('e'), two 'l' characters, then one 'o':
|
184
|
+
#
|
185
|
+
# "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> #<MatchData "Hello">
|
186
|
+
#
|
187
|
+
# Repetition is *greedy* by default: as many occurrences as possible are matched
|
188
|
+
# while still allowing the overall match to succeed. By contrast, *lazy*
|
189
|
+
# matching makes the minimal amount of matches necessary for overall success.
|
190
|
+
# Most greedy metacharacters can be made lazy by following them with `?`. For
|
191
|
+
# the `{n}` pattern, because it specifies an exact number of characters to match
|
192
|
+
# and not a variable number of characters, the `?` metacharacter instead makes
|
193
|
+
# the repeated pattern optional.
|
194
|
+
#
|
195
|
+
# Both patterns below match the string. The first uses a greedy quantifier so
|
196
|
+
# '.+' matches '<a><b>'; the second uses a lazy quantifier so '.+?' matches
|
197
|
+
# '<a>':
|
198
|
+
#
|
199
|
+
# /<.+>/.match("<a><b>") #=> #<MatchData "<a><b>">
|
200
|
+
# /<.+?>/.match("<a><b>") #=> #<MatchData "<a>">
|
201
|
+
#
|
202
|
+
# A quantifier followed by `+` matches *possessively*: once it has matched it
|
203
|
+
# does not backtrack. They behave like greedy quantifiers, but having matched
|
204
|
+
# they refuse to "give up" their match even if this jeopardises the overall
|
205
|
+
# match.
|
206
|
+
#
|
207
|
+
# ## Capturing
|
208
|
+
#
|
209
|
+
# Parentheses can be used for *capturing*. The text enclosed by the
|
210
|
+
# *n*<sup>th</sup> group of parentheses can be subsequently referred to with
|
211
|
+
# *n*. Within a pattern use the *backreference* `\n`; outside of the pattern use
|
212
|
+
# `MatchData[n]`.
|
213
|
+
#
|
214
|
+
# 'at' is captured by the first group of parentheses, then referred to later
|
215
|
+
# with `\1`:
|
216
|
+
#
|
217
|
+
# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")
|
218
|
+
# #=> #<MatchData "cat sat in" 1:"at">
|
219
|
+
#
|
220
|
+
# Regexp#match returns a MatchData object which makes the captured text
|
221
|
+
# available with its #[] method:
|
222
|
+
#
|
223
|
+
# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at'
|
224
|
+
#
|
225
|
+
# Capture groups can be referred to by name when defined with the
|
226
|
+
# `(?<`*name*`>)` or `(?'`*name*`')` constructs.
|
227
|
+
#
|
228
|
+
# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")
|
229
|
+
# #=> #<MatchData "$3.67" dollars:"3" cents:"67">
|
230
|
+
# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")[:dollars] #=> "3"
|
231
|
+
#
|
232
|
+
# Named groups can be backreferenced with `\k<`*name*`>`, where *name* is the
|
233
|
+
# group name.
|
234
|
+
#
|
235
|
+
# /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy')
|
236
|
+
# #=> #<MatchData "ototo" vowel:"o">
|
237
|
+
#
|
238
|
+
# **Note**: A regexp can't use named backreferences and numbered backreferences
|
239
|
+
# simultaneously. Also, if a named capture is used in a regexp, then parentheses
|
240
|
+
# used for grouping which would otherwise result in an unnamed capture are
|
241
|
+
# treated as non-capturing.
|
242
|
+
#
|
243
|
+
# /(\w)(\w)/.match("ab").captures # => ["a", "b"]
|
244
|
+
# /(\w)(\w)/.match("ab").named_captures # => {}
|
245
|
+
#
|
246
|
+
# /(?<c>\w)(\w)/.match("ab").captures # => ["a"]
|
247
|
+
# /(?<c>\w)(\w)/.match("ab").named_captures # => {"c"=>"a"}
|
248
|
+
#
|
249
|
+
# When named capture groups are used with a literal regexp on the left-hand side
|
250
|
+
# of an expression and the `=~` operator, the captured text is also assigned to
|
251
|
+
# local variables with corresponding names.
|
252
|
+
#
|
253
|
+
# /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ "$3.67" #=> 0
|
254
|
+
# dollars #=> "3"
|
255
|
+
#
|
256
|
+
# ## Grouping
|
257
|
+
#
|
258
|
+
# Parentheses also *group* the terms they enclose, allowing them to be
|
259
|
+
# quantified as one *atomic* whole.
|
260
|
+
#
|
261
|
+
# The pattern below matches a vowel followed by 2 word characters:
|
262
|
+
#
|
263
|
+
# /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> #<MatchData "aen">
|
264
|
+
#
|
265
|
+
# Whereas the following pattern matches a vowel followed by a word character,
|
266
|
+
# twice, i.e. `[aeiou]\w[aeiou]\w`: 'enor'.
|
267
|
+
#
|
268
|
+
# /([aeiou]\w){2}/.match("Caenorhabditis elegans")
|
269
|
+
# #=> #<MatchData "enor" 1:"or">
|
270
|
+
#
|
271
|
+
# The `(?:`...`)` construct provides grouping without capturing. That is, it
|
272
|
+
# combines the terms it contains into an atomic whole without creating a
|
273
|
+
# backreference. This benefits performance at the slight expense of readability.
|
274
|
+
#
|
275
|
+
# The first group of parentheses captures 'n' and the second 'ti'. The second
|
276
|
+
# group is referred to later with the backreference `\2`:
|
277
|
+
#
|
278
|
+
# /I(n)ves(ti)ga\2ons/.match("Investigations")
|
279
|
+
# #=> #<MatchData "Investigations" 1:"n" 2:"ti">
|
280
|
+
#
|
281
|
+
# The first group of parentheses is now made non-capturing with '?:', so it
|
282
|
+
# still matches 'n', but doesn't create the backreference. Thus, the
|
283
|
+
# backreference `\1` now refers to 'ti'.
|
284
|
+
#
|
285
|
+
# /I(?:n)ves(ti)ga\1ons/.match("Investigations")
|
286
|
+
# #=> #<MatchData "Investigations" 1:"ti">
|
287
|
+
#
|
288
|
+
# ### Atomic Grouping
|
289
|
+
#
|
290
|
+
# Grouping can be made *atomic* with `(?>`*pat*`)`. This causes the
|
291
|
+
# subexpression *pat* to be matched independently of the rest of the expression
|
292
|
+
# such that what it matches becomes fixed for the remainder of the match, unless
|
293
|
+
# the entire subexpression must be abandoned and subsequently revisited. In this
|
294
|
+
# way *pat* is treated as a non-divisible whole. Atomic grouping is typically
|
295
|
+
# used to optimise patterns so as to prevent the regular expression engine from
|
296
|
+
# backtracking needlessly.
|
297
|
+
#
|
298
|
+
# The `"` in the pattern below matches the first character of the string, then
|
299
|
+
# `.*` matches *Quote"*. This causes the overall match to fail, so the text
|
300
|
+
# matched by `.*` is backtracked by one position, which leaves the final
|
301
|
+
# character of the string available to match `"`.
|
302
|
+
#
|
303
|
+
# /".*"/.match('"Quote"') #=> #<MatchData "\"Quote\"">
|
304
|
+
#
|
305
|
+
# If `.*` is grouped atomically, it refuses to backtrack *Quote"*, even though
|
306
|
+
# this means that the overall match fails.
|
307
|
+
#
|
308
|
+
# /"(?>.*)"/.match('"Quote"') #=> nil
|
309
|
+
#
|
310
|
+
# ## Subexpression Calls
|
311
|
+
#
|
312
|
+
# The `\g<`*name*`>` syntax matches the previous subexpression named *name*,
|
313
|
+
# which can be a group name or number, again. This differs from backreferences
|
314
|
+
# in that it re-executes the group rather than simply trying to re-match the
|
315
|
+
# same text.
|
316
|
+
#
|
317
|
+
# This pattern matches a *(* character and assigns it to the `paren` group,
|
318
|
+
# tries to call the `paren` sub-expression again but fails, then matches a
|
319
|
+
# literal *)*:
|
320
|
+
#
|
321
|
+
# /\A(?<paren>\(\g<paren>*\))*\z/ =~ '()'
|
322
|
+
#
|
323
|
+
# /\A(?<paren>\(\g<paren>*\))*\z/ =~ '(())' #=> 0
|
324
|
+
# # ^1
|
325
|
+
# # ^2
|
326
|
+
# # ^3
|
327
|
+
# # ^4
|
328
|
+
# # ^5
|
329
|
+
# # ^6
|
330
|
+
# # ^7
|
331
|
+
# # ^8
|
332
|
+
# # ^9
|
333
|
+
# # ^10
|
334
|
+
#
|
335
|
+
# 1. Matches at the beginning of the string, i.e. before the first character.
|
336
|
+
# 2. Enters a named capture group called `paren`
|
337
|
+
# 3. Matches a literal *(*, the first character in the string
|
338
|
+
# 4. Calls the `paren` group again, i.e. recurses back to the second step
|
339
|
+
# 5. Re-enters the `paren` group
|
340
|
+
# 6. Matches a literal *(*, the second character in the string
|
341
|
+
# 7. Try to call `paren` a third time, but fail because doing so would prevent
|
342
|
+
# an overall successful match
|
343
|
+
# 8. Match a literal *)*, the third character in the string. Marks the end of
|
344
|
+
# the second recursive call
|
345
|
+
# 9. Match a literal *)*, the fourth character in the string
|
346
|
+
# 10. Match the end of the string
|
347
|
+
#
|
348
|
+
#
|
349
|
+
# ## Alternation
|
350
|
+
#
|
351
|
+
# The vertical bar metacharacter (`|`) combines two expressions into a single
|
352
|
+
# one that matches either of the expressions. Each expression is an
|
353
|
+
# *alternative*.
|
354
|
+
#
|
355
|
+
# /\w(and|or)\w/.match("Feliformia") #=> #<MatchData "form" 1:"or">
|
356
|
+
# /\w(and|or)\w/.match("furandi") #=> #<MatchData "randi" 1:"and">
|
357
|
+
# /\w(and|or)\w/.match("dissemblance") #=> nil
|
358
|
+
#
|
359
|
+
# ## Character Properties
|
360
|
+
#
|
361
|
+
# The `\p{}` construct matches characters with the named property, much like
|
362
|
+
# POSIX bracket classes.
|
363
|
+
#
|
364
|
+
# * `/\p{Alnum}/` - Alphabetic and numeric character
|
365
|
+
# * `/\p{Alpha}/` - Alphabetic character
|
366
|
+
# * `/\p{Blank}/` - Space or tab
|
367
|
+
# * `/\p{Cntrl}/` - Control character
|
368
|
+
# * `/\p{Digit}/` - Digit
|
369
|
+
# * `/\p{Graph}/` - Non-blank character (excludes spaces, control characters,
|
370
|
+
# and similar)
|
371
|
+
# * `/\p{Lower}/` - Lowercase alphabetical character
|
372
|
+
# * `/\p{Print}/` - Like `\p{Graph}`, but includes the space character
|
373
|
+
# * `/\p{Punct}/` - Punctuation character
|
374
|
+
# * `/\p{Space}/` - Whitespace character (`[:blank:]`, newline, carriage
|
375
|
+
# return, etc.)
|
376
|
+
# * `/\p{Upper}/` - Uppercase alphabetical
|
377
|
+
# * `/\p{XDigit}/` - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
|
378
|
+
# * `/\p{Word}/` - A member of one of the following Unicode general categories
|
379
|
+
# *Letter*, *Mark*, *Number*, *Connector_Punctuation*
|
380
|
+
# * `/\p{ASCII}/` - A character in the ASCII character set
|
381
|
+
# * `/\p{Any}/` - Any Unicode character (including unassigned characters)
|
382
|
+
# * `/\p{Assigned}/` - An assigned character
|
383
|
+
#
|
384
|
+
#
|
385
|
+
# A Unicode character's *General Category* value can also be matched with
|
386
|
+
# `\p{`*Ab*`}` where *Ab* is the category's abbreviation as described below:
|
387
|
+
#
|
388
|
+
# * `/\p{L}/` - 'Letter'
|
389
|
+
# * `/\p{Ll}/` - 'Letter: Lowercase'
|
390
|
+
# * `/\p{Lm}/` - 'Letter: Modifier'
|
391
|
+
# * `/\p{Lo}/` - 'Letter: Other'
|
392
|
+
# * `/\p{Lt}/` - 'Letter: Titlecase'
|
393
|
+
# * `/\p{Lu}/` - 'Letter: Uppercase'
|
394
|
+
# * `/\p{Lo}/` - 'Letter: Other'
|
395
|
+
# * `/\p{M}/` - 'Mark'
|
396
|
+
# * `/\p{Mn}/` - 'Mark: Nonspacing'
|
397
|
+
# * `/\p{Mc}/` - 'Mark: Spacing Combining'
|
398
|
+
# * `/\p{Me}/` - 'Mark: Enclosing'
|
399
|
+
# * `/\p{N}/` - 'Number'
|
400
|
+
# * `/\p{Nd}/` - 'Number: Decimal Digit'
|
401
|
+
# * `/\p{Nl}/` - 'Number: Letter'
|
402
|
+
# * `/\p{No}/` - 'Number: Other'
|
403
|
+
# * `/\p{P}/` - 'Punctuation'
|
404
|
+
# * `/\p{Pc}/` - 'Punctuation: Connector'
|
405
|
+
# * `/\p{Pd}/` - 'Punctuation: Dash'
|
406
|
+
# * `/\p{Ps}/` - 'Punctuation: Open'
|
407
|
+
# * `/\p{Pe}/` - 'Punctuation: Close'
|
408
|
+
# * `/\p{Pi}/` - 'Punctuation: Initial Quote'
|
409
|
+
# * `/\p{Pf}/` - 'Punctuation: Final Quote'
|
410
|
+
# * `/\p{Po}/` - 'Punctuation: Other'
|
411
|
+
# * `/\p{S}/` - 'Symbol'
|
412
|
+
# * `/\p{Sm}/` - 'Symbol: Math'
|
413
|
+
# * `/\p{Sc}/` - 'Symbol: Currency'
|
414
|
+
# * `/\p{Sc}/` - 'Symbol: Currency'
|
415
|
+
# * `/\p{Sk}/` - 'Symbol: Modifier'
|
416
|
+
# * `/\p{So}/` - 'Symbol: Other'
|
417
|
+
# * `/\p{Z}/` - 'Separator'
|
418
|
+
# * `/\p{Zs}/` - 'Separator: Space'
|
419
|
+
# * `/\p{Zl}/` - 'Separator: Line'
|
420
|
+
# * `/\p{Zp}/` - 'Separator: Paragraph'
|
421
|
+
# * `/\p{C}/` - 'Other'
|
422
|
+
# * `/\p{Cc}/` - 'Other: Control'
|
423
|
+
# * `/\p{Cf}/` - 'Other: Format'
|
424
|
+
# * `/\p{Cn}/` - 'Other: Not Assigned'
|
425
|
+
# * `/\p{Co}/` - 'Other: Private Use'
|
426
|
+
# * `/\p{Cs}/` - 'Other: Surrogate'
|
427
|
+
#
|
428
|
+
#
|
429
|
+
# Lastly, `\p{}` matches a character's Unicode *script*. The following scripts
|
430
|
+
# are supported: *Arabic*, *Armenian*, *Balinese*, *Bengali*, *Bopomofo*,
|
431
|
+
# *Braille*, *Buginese*, *Buhid*, *Canadian_Aboriginal*, *Carian*, *Cham*,
|
432
|
+
# *Cherokee*, *Common*, *Coptic*, *Cuneiform*, *Cypriot*, *Cyrillic*, *Deseret*,
|
433
|
+
# *Devanagari*, *Ethiopic*, *Georgian*, *Glagolitic*, *Gothic*, *Greek*,
|
434
|
+
# *Gujarati*, *Gurmukhi*, *Han*, *Hangul*, *Hanunoo*, *Hebrew*, *Hiragana*,
|
435
|
+
# *Inherited*, *Kannada*, *Katakana*, *Kayah_Li*, *Kharoshthi*, *Khmer*, *Lao*,
|
436
|
+
# *Latin*, *Lepcha*, *Limbu*, *Linear_B*, *Lycian*, *Lydian*, *Malayalam*,
|
437
|
+
# *Mongolian*, *Myanmar*, *New_Tai_Lue*, *Nko*, *Ogham*, *Ol_Chiki*,
|
438
|
+
# *Old_Italic*, *Old_Persian*, *Oriya*, *Osmanya*, *Phags_Pa*, *Phoenician*,
|
439
|
+
# *Rejang*, *Runic*, *Saurashtra*, *Shavian*, *Sinhala*, *Sundanese*,
|
440
|
+
# *Syloti_Nagri*, *Syriac*, *Tagalog*, *Tagbanwa*, *Tai_Le*, *Tamil*, *Telugu*,
|
441
|
+
# *Thaana*, *Thai*, *Tibetan*, *Tifinagh*, *Ugaritic*, *Vai*, and *Yi*.
|
442
|
+
#
|
443
|
+
# Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and belongs to the
|
444
|
+
# Arabic script:
|
445
|
+
#
|
446
|
+
# /\p{Arabic}/.match("\u06E9") #=> #<MatchData "\u06E9">
|
447
|
+
#
|
448
|
+
# All character properties can be inverted by prefixing their name with a caret
|
449
|
+
# (`^`).
|
450
|
+
#
|
451
|
+
# Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so this
|
452
|
+
# match succeeds:
|
453
|
+
#
|
454
|
+
# /\p{^Ll}/.match("A") #=> #<MatchData "A">
|
455
|
+
#
|
456
|
+
# ## Anchors
|
457
|
+
#
|
458
|
+
# Anchors are metacharacters that match the zero-width positions between
|
459
|
+
# characters, *anchoring* the match to a specific position.
|
460
|
+
#
|
461
|
+
# * `^` - Matches beginning of line
|
462
|
+
# * `$` - Matches end of line
|
463
|
+
# * `\A` - Matches beginning of string.
|
464
|
+
# * `\Z` - Matches end of string. If string ends with a newline, it matches
|
465
|
+
# just before newline
|
466
|
+
# * `\z` - Matches end of string
|
467
|
+
# * `\G` - Matches first matching position:
|
468
|
+
#
|
469
|
+
# In methods like `String#gsub` and `String#scan`, it changes on each
|
470
|
+
# iteration. It initially matches the beginning of subject, and in each
|
471
|
+
# following iteration it matches where the last match finished.
|
472
|
+
#
|
473
|
+
# "    a b c".gsub(/ /, '_')    #=> "____a_b_c"
|
474
|
+
# "    a b c".gsub(/\G /, '_')  #=> "____a b c"
|
475
|
+
#
|
476
|
+
# In methods like `Regexp#match` and `String#match` that take an (optional)
|
477
|
+
# offset, it matches where the search begins.
|
478
|
+
#
|
479
|
+
# "hello, world".match(/,/, 3) #=> #<MatchData ",">
|
480
|
+
# "hello, world".match(/\G,/, 3) #=> nil
|
481
|
+
#
|
482
|
+
# * `\b` - Matches word boundaries when outside brackets; backspace (0x08)
|
483
|
+
# when inside brackets
|
484
|
+
# * `\B` - Matches non-word boundaries
|
485
|
+
# * `(?=`*pat*`)` - *Positive lookahead* assertion: ensures that the following
|
486
|
+
# characters match *pat*, but doesn't include those characters in the
|
487
|
+
# matched text
|
488
|
+
# * `(?!`*pat*`)` - *Negative lookahead* assertion: ensures that the following
|
489
|
+
# characters do not match *pat*, but doesn't include those characters in the
|
490
|
+
# matched text
|
491
|
+
# * `(?<=`*pat*`)` - *Positive lookbehind* assertion: ensures that the
|
492
|
+
# preceding characters match *pat*, but doesn't include those characters in
|
493
|
+
# the matched text
|
494
|
+
# * `(?<!`*pat*`)` - *Negative lookbehind* assertion: ensures that the
|
495
|
+
# preceding characters do not match *pat*, but doesn't include those
|
496
|
+
# characters in the matched text
|
497
|
+
#
|
498
|
+
#
|
499
|
+
# If a pattern isn't anchored it can begin at any point in the string:
|
500
|
+
#
|
501
|
+
# /real/.match("surrealist") #=> #<MatchData "real">
|
502
|
+
#
|
503
|
+
# Anchoring the pattern to the beginning of the string forces the match to start
|
504
|
+
# there. 'real' doesn't occur at the beginning of the string, so now the match
|
505
|
+
# fails:
|
506
|
+
#
|
507
|
+
# /\Areal/.match("surrealist") #=> nil
|
508
|
+
#
|
509
|
+
# The match below fails because although 'Demand' contains 'and', the pattern
|
510
|
+
# does not occur at a word boundary.
|
511
|
+
#
|
512
|
+
# /\band/.match("Demand")
|
513
|
+
#
|
514
|
+
# Whereas in the following example 'and' has been anchored to a non-word
|
515
|
+
# boundary so instead of matching the first 'and' it matches from the fourth
|
516
|
+
# letter of 'demand':
|
517
|
+
#
|
518
|
+
# /\Band.+/.match("Supply and demand curve") #=> #<MatchData "and curve">
|
519
|
+
#
|
520
|
+
# The pattern below uses positive lookahead and positive lookbehind to match
|
521
|
+
# text appearing in tags without including the tags in the match:
|
522
|
+
#
|
523
|
+
# /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favours the <b>bold</b>")
|
524
|
+
# #=> #<MatchData "bold">
|
525
|
+
#
|
526
|
+
# ## Options
|
527
|
+
#
|
528
|
+
# The end delimiter for a regexp can be followed by one or more single-letter
|
529
|
+
# options which control how the pattern can match.
|
530
|
+
#
|
531
|
+
# * `/pat/i` - Ignore case
|
532
|
+
# * `/pat/m` - Treat a newline as a character matched by `.`
|
533
|
+
# * `/pat/x` - Ignore whitespace and comments in the pattern
|
534
|
+
# * `/pat/o` - Perform `#{}` interpolation only once
|
535
|
+
#
|
536
|
+
#
|
537
|
+
# `i`, `m`, and `x` can also be applied on the subexpression level with the
|
538
|
+
# `(?`*on*`-`*off*`)` construct, which enables options *on*, and disables
|
539
|
+
# options *off* for the expression enclosed by the parentheses:
|
540
|
+
#
|
541
|
+
# /a(?i:b)c/.match('aBc') #=> #<MatchData "aBc">
|
542
|
+
# /a(?-i:b)c/i.match('ABC') #=> nil
|
543
|
+
#
|
544
|
+
# Additionally, these options can also be toggled for the remainder of the
|
545
|
+
# pattern:
|
546
|
+
#
|
547
|
+
# /a(?i)bc/.match('abC') #=> #<MatchData "abC">
|
548
|
+
#
|
549
|
+
# Options may also be used with `Regexp.new`:
|
550
|
+
#
|
551
|
+
# Regexp.new("abc", Regexp::IGNORECASE) #=> /abc/i
|
552
|
+
# Regexp.new("abc", Regexp::MULTILINE) #=> /abc/m
|
553
|
+
# Regexp.new("abc # Comment", Regexp::EXTENDED) #=> /abc # Comment/x
|
554
|
+
# Regexp.new("abc", Regexp::IGNORECASE | Regexp::MULTILINE) #=> /abc/mi
|
555
|
+
#
|
556
|
+
# ## Free-Spacing Mode and Comments
|
557
|
+
#
|
558
|
+
# As mentioned above, the `x` option enables *free-spacing* mode. Literal white
|
559
|
+
# space inside the pattern is ignored, and the octothorpe (`#`) character
|
560
|
+
# introduces a comment until the end of the line. This allows the components of
|
561
|
+
# the pattern to be organized in a potentially more readable fashion.
|
562
|
+
#
|
563
|
+
# A contrived pattern to match a number with optional decimal places:
|
564
|
+
#
|
565
|
+
# float_pat = /\A
|
566
|
+
# [[:digit:]]+ # 1 or more digits before the decimal point
|
567
|
+
# (\. # Decimal point
|
568
|
+
# [[:digit:]]+ # 1 or more digits after the decimal point
|
569
|
+
# )? # The decimal point and following digits are optional
|
570
|
+
# \Z/x
|
571
|
+
# float_pat.match('3.14') #=> #<MatchData "3.14" 1:".14">
|
572
|
+
#
|
573
|
+
# There are a number of strategies for matching whitespace:
|
574
|
+
#
|
575
|
+
# * Use a pattern such as `\s` or `\p{Space}`.
|
576
|
+
# * Use escaped whitespace such as `\ `, i.e. a space preceded by a backslash.
|
577
|
+
# * Use a character class such as `[ ]`.
|
578
|
+
#
|
579
|
+
#
|
580
|
+
# Comments can be included in a non-`x` pattern with the `(?#`*comment*`)`
|
581
|
+
# construct, where *comment* is arbitrary text ignored by the regexp engine.
|
582
|
+
#
|
583
|
+
# Comments in regexp literals cannot include unescaped terminator characters.
|
584
|
+
#
|
585
|
+
# ## Encoding
|
586
|
+
#
|
587
|
+
# Regular expressions are assumed to use the source encoding. This can be
|
588
|
+
# overridden with one of the following modifiers.
|
589
|
+
#
|
590
|
+
# * `/`*pat*`/u` - UTF-8
|
591
|
+
# * `/`*pat*`/e` - EUC-JP
|
592
|
+
# * `/`*pat*`/s` - Windows-31J
|
593
|
+
# * `/`*pat*`/n` - ASCII-8BIT
|
594
|
+
#
|
595
|
+
#
|
596
|
+
# A regexp can be matched against a string when they either share an encoding,
|
597
|
+
# or the regexp's encoding is *US-ASCII* and the string's encoding is
|
598
|
+
# ASCII-compatible.
|
599
|
+
#
|
600
|
+
# If a match between incompatible encodings is attempted an
|
601
|
+
# `Encoding::CompatibilityError` exception is raised.
|
602
|
+
#
|
603
|
+
# The `Regexp#fixed_encoding?` predicate indicates whether the regexp has a
|
604
|
+
# *fixed* encoding, that is one incompatible with ASCII. A regexp's encoding can
|
605
|
+
# be explicitly fixed by supplying `Regexp::FIXEDENCODING` as the second
|
606
|
+
# argument of `Regexp.new`:
|
607
|
+
#
|
608
|
+
# r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING)
|
609
|
+
# r =~ "a\u3042"
|
610
|
+
# # raises Encoding::CompatibilityError: incompatible encoding regexp match
|
611
|
+
# # (ISO-8859-1 regexp with UTF-8 string)
|
612
|
+
#
|
613
|
+
# ## Special global variables
|
614
|
+
#
|
615
|
+
# Pattern matching sets some global variables:
|
616
|
+
# * `$~` is equivalent to Regexp.last_match;
|
617
|
+
# * `$&` contains the complete matched text;
|
618
|
+
# * `` $` `` contains string before match;
|
619
|
+
# * `$'` contains string after match;
|
620
|
+
# * `$1`, `$2` and so on contain text matching first, second, etc capture
|
621
|
+
# group;
|
622
|
+
# * `$+` contains last capture group.
|
623
|
+
#
|
624
|
+
#
|
625
|
+
# Example:
|
626
|
+
#
|
627
|
+
# m = /s(\w{2}).*(c)/.match('haystack') #=> #<MatchData "stac" 1:"ta" 2:"c">
|
628
|
+
# $~ #=> #<MatchData "stac" 1:"ta" 2:"c">
|
629
|
+
# Regexp.last_match #=> #<MatchData "stac" 1:"ta" 2:"c">
|
630
|
+
#
|
631
|
+
# $& #=> "stac"
|
632
|
+
# # same as m[0]
|
633
|
+
# $` #=> "hay"
|
634
|
+
# # same as m.pre_match
|
635
|
+
# $' #=> "k"
|
636
|
+
# # same as m.post_match
|
637
|
+
# $1 #=> "ta"
|
638
|
+
# # same as m[1]
|
639
|
+
# $2 #=> "c"
|
640
|
+
# # same as m[2]
|
641
|
+
# $3 #=> nil
|
642
|
+
# # no third group in pattern
|
643
|
+
# $+ #=> "c"
|
644
|
+
# # same as m[-1]
|
645
|
+
#
|
646
|
+
# These global variables are thread-local and method-local variables.
|
647
|
+
#
|
648
|
+
# ## Performance
|
649
|
+
#
|
650
|
+
# Certain pathological combinations of constructs can lead to abysmally bad
|
651
|
+
# performance.
|
652
|
+
#
|
653
|
+
# Consider a string of 25 *a*s, a *d*, 4 *a*s, and a *c*.
|
654
|
+
#
|
655
|
+
# s = 'a' * 25 + 'd' + 'a' * 4 + 'c'
|
656
|
+
# #=> "aaaaaaaaaaaaaaaaaaaaaaaaadaaaac"
|
657
|
+
#
|
658
|
+
# The following patterns match instantly as you would expect:
|
659
|
+
#
|
660
|
+
# /(b|a)/ =~ s #=> 0
|
661
|
+
# /(b|a+)/ =~ s #=> 0
|
662
|
+
# /(b|a+)*/ =~ s #=> 0
|
663
|
+
#
|
664
|
+
# However, the following pattern takes appreciably longer:
|
665
|
+
#
|
666
|
+
# /(b|a+)*c/ =~ s #=> 26
|
667
|
+
#
|
668
|
+
# This happens because an atom in the regexp is quantified by both an immediate
|
669
|
+
# `+` and an enclosing `*` with nothing to differentiate which is in control of
|
670
|
+
# any particular character. The nondeterminism that results produces
|
671
|
+
# super-linear performance. (Consult *Mastering Regular Expressions* (3rd ed.),
|
672
|
+
# p. 222, by *Jeffrey Friedl*, for an in-depth analysis). This particular case
|
673
|
+
# can be fixed by use of atomic grouping, which prevents the unnecessary
|
674
|
+
# backtracking:
|
675
|
+
#
|
676
|
+
# (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start)
|
677
|
+
# #=> 24.702736882
|
678
|
+
# (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start)
|
679
|
+
# #=> 0.000166571
|
680
|
+
#
|
681
|
+
# A similar case is typified by the following example, which takes approximately
|
682
|
+
# 60 seconds to execute for me:
|
683
|
+
#
|
684
|
+
# Match a string of 29 *a*s against a pattern of 29 optional *a*s followed by 29
|
685
|
+
# mandatory *a*s:
|
686
|
+
#
|
687
|
+
# Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29
|
688
|
+
#
|
689
|
+
# The 29 optional *a*s match the string, but this prevents the 29 mandatory *a*s
|
690
|
+
# that follow from matching. Ruby must then backtrack repeatedly so as to
|
691
|
+
# satisfy as many of the optional matches as it can while still matching the
|
692
|
+
# mandatory 29. It is plain to us that none of the optional matches can succeed,
|
693
|
+
# but this fact unfortunately eludes Ruby.
|
694
|
+
#
|
695
|
+
# The best way to improve performance is to significantly reduce the amount of
|
696
|
+
# backtracking needed. For this case, instead of individually matching 29
|
697
|
+
# optional *a*s, a range of optional *a*s can be matched all at once with
|
698
|
+
# *a{0,29}*:
|
699
|
+
#
|
700
|
+
# Regexp.new('a{0,29}' + 'a' * 29) =~ 'a' * 29
|
701
|
+
#
|
702
|
+
class Regexp
|
703
|
+
# Constructs a new regular expression from `pattern`, which can be either a
|
704
|
+
# String or a Regexp (in which case that regexp's options are propagated), and
|
705
|
+
# new options may not be specified (a change as of Ruby 1.8).
|
706
|
+
#
|
707
|
+
# If `options` is an Integer, it should be one or more of the constants
|
708
|
+
# Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE, *or*-ed together.
|
709
|
+
# Otherwise, if `options` is not `nil` or `false`, the regexp will be case
|
710
|
+
# insensitive.
|
711
|
+
#
|
712
|
+
# r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
|
713
|
+
# r2 = Regexp.new('cat', true) #=> /cat/i
|
714
|
+
# r3 = Regexp.new(r2) #=> /cat/i
|
715
|
+
# r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix
|
716
|
+
#
|
717
|
+
def initialize: (String string, ?untyped options, ?String kcode) -> Object
|
718
|
+
| (Regexp regexp) -> void
|
719
|
+
|
720
|
+
# Alias for Regexp.new
|
721
|
+
#
|
722
|
+
def self.compile: (String string, ?untyped options, ?String kcode) -> Regexp
|
723
|
+
| (Regexp regexp) -> Regexp
|
724
|
+
|
725
|
+
# Escapes any characters that would have special meaning in a regular
|
726
|
+
# expression. Returns a new escaped string with the same or compatible encoding.
|
727
|
+
# For any string *str*, `Regexp.new(Regexp.escape(str)) =~ str` will be true.
|
728
|
+
#
|
729
|
+
# Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
|
730
|
+
#
|
731
|
+
def self.escape: (String | Symbol str) -> String
|
732
|
+
|
733
|
+
# The first form returns the MatchData object generated by the last successful
|
734
|
+
# pattern match. Equivalent to reading the special global variable `$~` (see
|
735
|
+
# Special global variables in Regexp for details).
|
736
|
+
#
|
737
|
+
# The second form returns the *n*th field in this MatchData object. *n* can be a
|
738
|
+
# string or symbol to reference a named capture.
|
739
|
+
#
|
740
|
+
# Note that the last_match is local to the thread and method scope of the method
|
741
|
+
# that did the pattern match.
|
742
|
+
#
|
743
|
+
# /c(.)t/ =~ 'cat' #=> 0
|
744
|
+
# Regexp.last_match #=> #<MatchData "cat" 1:"a">
|
745
|
+
# Regexp.last_match(0) #=> "cat"
|
746
|
+
# Regexp.last_match(1) #=> "a"
|
747
|
+
# Regexp.last_match(2) #=> nil
|
748
|
+
#
|
749
|
+
# /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
|
750
|
+
# Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val">
|
751
|
+
# Regexp.last_match(:lhs) #=> "var"
|
752
|
+
# Regexp.last_match(:rhs) #=> "val"
|
753
|
+
#
|
754
|
+
def self.last_match: () -> MatchData?
|
755
|
+
| (Integer n) -> String?
|
756
|
+
| (Symbol | String n) -> String?
|
757
|
+
|
758
|
+
# Escapes any characters that would have special meaning in a regular
|
759
|
+
# expression. Returns a new escaped string with the same or compatible encoding.
|
760
|
+
# For any string *str*, `Regexp.new(Regexp.escape(str)) =~ str` will be true.
|
761
|
+
#
|
762
|
+
# Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
|
763
|
+
#
|
764
|
+
def self.quote: (String | Symbol str) -> String
|
765
|
+
|
766
|
+
# Try to convert *obj* into a Regexp, using the to_regexp method. Returns converted
|
767
|
+
# regexp or nil if *obj* cannot be converted for any reason.
|
768
|
+
#
|
769
|
+
# Regexp.try_convert(/re/) #=> /re/
|
770
|
+
# Regexp.try_convert("re") #=> nil
|
771
|
+
#
|
772
|
+
# o = Object.new
|
773
|
+
# Regexp.try_convert(o) #=> nil
|
774
|
+
# def o.to_regexp() /foo/ end
|
775
|
+
# Regexp.try_convert(o) #=> /foo/
|
776
|
+
#
|
777
|
+
def self.try_convert: (untyped obj) -> Regexp?
|
778
|
+
|
779
|
+
# Return a Regexp object that is the union of the given *pattern*s, i.e., will
|
780
|
+
# match any of its parts. The *pattern*s can be Regexp objects, in which case
|
781
|
+
# their options will be preserved, or Strings. If no patterns are given, returns
|
782
|
+
# `/(?!)/`. The behavior is unspecified if any given *pattern* contains
|
783
|
+
# capture.
|
784
|
+
#
|
785
|
+
# Regexp.union #=> /(?!)/
|
786
|
+
# Regexp.union("penzance") #=> /penzance/
|
787
|
+
# Regexp.union("a+b*c") #=> /a\+b\*c/
|
788
|
+
# Regexp.union("skiing", "sledding") #=> /skiing|sledding/
|
789
|
+
# Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
|
790
|
+
# Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
|
791
|
+
#
|
792
|
+
# Note: the arguments for ::union will try to be converted into a regular
|
793
|
+
# expression literal via #to_regexp.
|
794
|
+
#
|
795
|
+
def self.union: () -> Regexp
|
796
|
+
| (String | Regexp pat1, *String | Regexp pat2) -> Regexp
|
797
|
+
| (::Array[String | Regexp]) -> Regexp
|
798
|
+
|
799
|
+
public
|
800
|
+
|
801
|
+
# Equality---Two regexps are equal if their patterns are identical, they have
|
802
|
+
# the same character set code, and their `casefold?` values are the same.
|
803
|
+
#
|
804
|
+
# /abc/ == /abc/x #=> false
|
805
|
+
# /abc/ == /abc/i #=> false
|
806
|
+
# /abc/ == /abc/u #=> false
|
807
|
+
# /abc/u == /abc/n #=> false
|
808
|
+
#
|
809
|
+
def ==: (untyped other) -> bool
|
810
|
+
|
811
|
+
# Case Equality---Used in case statements.
|
812
|
+
#
|
813
|
+
# a = "HELLO"
|
814
|
+
# case a
|
815
|
+
# when /\A[a-z]*\z/; print "Lower case\n"
|
816
|
+
# when /\A[A-Z]*\z/; print "Upper case\n"
|
817
|
+
# else; print "Mixed case\n"
|
818
|
+
# end
|
819
|
+
# #=> "Upper case"
|
820
|
+
#
|
821
|
+
# Following a regular expression literal with the #=== operator allows you to
|
822
|
+
# compare against a String.
|
823
|
+
#
|
824
|
+
# /^[a-z]*$/ === "HELLO" #=> false
|
825
|
+
# /^[A-Z]*$/ === "HELLO" #=> true
|
826
|
+
#
|
827
|
+
def ===: (untyped other) -> bool
|
828
|
+
|
829
|
+
# Match---Matches *rxp* against *str*.
|
830
|
+
#
|
831
|
+
# /at/ =~ "input data" #=> 7
|
832
|
+
# /ax/ =~ "input data" #=> nil
|
833
|
+
#
|
834
|
+
# If `=~` is used with a regexp literal with named captures, captured strings
|
835
|
+
# (or nil) are assigned to local variables named by the capture names.
|
836
|
+
#
|
837
|
+
# /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y "
|
838
|
+
# p lhs #=> "x"
|
839
|
+
# p rhs #=> "y"
|
840
|
+
#
|
841
|
+
# If it is not matched, nil is assigned for the variables.
|
842
|
+
#
|
843
|
+
# /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = "
|
844
|
+
# p lhs #=> nil
|
845
|
+
# p rhs #=> nil
|
846
|
+
#
|
847
|
+
# This assignment is implemented in the Ruby parser. The parser detects
|
848
|
+
# 'regexp-literal =~ expression' for the assignment. The regexp must be a
|
849
|
+
# literal without interpolation and placed at left hand side.
|
850
|
+
#
|
851
|
+
# The assignment does not occur if the regexp is not a literal.
|
852
|
+
#
|
853
|
+
# re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
|
854
|
+
# re =~ " x = y "
|
855
|
+
# p lhs # undefined local variable
|
856
|
+
# p rhs # undefined local variable
|
857
|
+
#
|
858
|
+
# A regexp interpolation, `#{}`, also disables the assignment.
|
859
|
+
#
|
860
|
+
# rhs_pat = /(?<rhs>\w+)/
|
861
|
+
# /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
|
862
|
+
# p lhs # undefined local variable
|
863
|
+
#
|
864
|
+
# The assignment does not occur if the regexp is placed at the right hand side.
|
865
|
+
#
|
866
|
+
# " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
|
867
|
+
# p lhs, rhs # undefined local variable
|
868
|
+
#
|
869
|
+
def =~: (String? str) -> Integer?
|
870
|
+
|
871
|
+
# Returns the value of the case-insensitive flag.
|
872
|
+
#
|
873
|
+
# /a/.casefold? #=> false
|
874
|
+
# /a/i.casefold? #=> true
|
875
|
+
# /(?i:a)/.casefold? #=> false
|
876
|
+
#
|
877
|
+
def casefold?: () -> bool
|
878
|
+
|
879
|
+
# Returns the Encoding object that represents the encoding of obj.
|
880
|
+
#
|
881
|
+
def encoding: () -> Encoding
|
882
|
+
|
883
|
+
# Equality---Two regexps are equal if their patterns are identical, they have
|
884
|
+
# the same character set code, and their `casefold?` values are the same.
|
885
|
+
#
|
886
|
+
# /abc/ == /abc/x #=> false
|
887
|
+
# /abc/ == /abc/i #=> false
|
888
|
+
# /abc/ == /abc/u #=> false
|
889
|
+
# /abc/u == /abc/n #=> false
|
890
|
+
#
|
891
|
+
def eql?: (untyped other) -> bool
|
892
|
+
|
893
|
+
# Returns false if rxp is applicable to a string with any ASCII compatible
|
894
|
+
# encoding. Returns true otherwise.
|
895
|
+
#
|
896
|
+
# r = /a/
|
897
|
+
# r.fixed_encoding? #=> false
|
898
|
+
# r =~ "\u{6666} a" #=> 2
|
899
|
+
# r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2
|
900
|
+
# r =~ "abc".force_encoding("euc-jp") #=> 0
|
901
|
+
#
|
902
|
+
# r = /a/u
|
903
|
+
# r.fixed_encoding? #=> true
|
904
|
+
# r.encoding #=> #<Encoding:UTF-8>
|
905
|
+
# r =~ "\u{6666} a" #=> 2
|
906
|
+
# r =~ "\xa1\xa2".force_encoding("euc-jp") #=> Encoding::CompatibilityError
|
907
|
+
# r =~ "abc".force_encoding("euc-jp") #=> 0
|
908
|
+
#
|
909
|
+
# r = /\u{6666}/
|
910
|
+
# r.fixed_encoding? #=> true
|
911
|
+
# r.encoding #=> #<Encoding:UTF-8>
|
912
|
+
# r =~ "\u{6666} a" #=> 0
|
913
|
+
# r =~ "\xa1\xa2".force_encoding("euc-jp") #=> Encoding::CompatibilityError
|
914
|
+
# r =~ "abc".force_encoding("euc-jp") #=> nil
|
915
|
+
#
|
916
|
+
def fixed_encoding?: () -> bool
|
917
|
+
|
918
|
+
# Produce a hash based on the text and options of this regular expression.
|
919
|
+
#
|
920
|
+
# See also Object#hash.
|
921
|
+
#
|
922
|
+
def hash: () -> Integer
|
923
|
+
|
924
|
+
# Produce a nicely formatted string-version of *rxp*. Perhaps surprisingly,
|
925
|
+
# `#inspect` actually produces the more natural version of the string than
|
926
|
+
# `#to_s`.
|
927
|
+
#
|
928
|
+
# /ab+c/ix.inspect #=> "/ab+c/ix"
|
929
|
+
#
|
930
|
+
def inspect: () -> String
|
931
|
+
|
932
|
+
# Returns a MatchData object describing the match, or `nil` if there was no
|
933
|
+
# match. This is equivalent to retrieving the value of the special variable `$~`
|
934
|
+
# following a normal match. If the second parameter is present, it specifies
|
935
|
+
# the position in the string to begin the search.
|
936
|
+
#
|
937
|
+
# /(.)(.)(.)/.match("abc")[2] #=> "b"
|
938
|
+
# /(.)(.)/.match("abc", 1)[2] #=> "c"
|
939
|
+
#
|
940
|
+
# If a block is given, invokes the block with MatchData if the match succeeds, so that
|
941
|
+
# you can write
|
942
|
+
#
|
943
|
+
# /M(.*)/.match("Matz") do |m|
|
944
|
+
# puts m[0]
|
945
|
+
# puts m[1]
|
946
|
+
# end
|
947
|
+
#
|
948
|
+
# instead of
|
949
|
+
#
|
950
|
+
# if m = /M(.*)/.match("Matz")
|
951
|
+
# puts m[0]
|
952
|
+
# puts m[1]
|
953
|
+
# end
|
954
|
+
#
|
955
|
+
# The return value is a value from block execution in this case.
|
956
|
+
#
|
957
|
+
def match: (String? | Symbol | _ToStr str, ?Integer pos) -> MatchData?
|
958
|
+
| [T] (String? | Symbol | _ToStr str, ?Integer pos) { (MatchData) -> T } -> T?
|
959
|
+
|
960
|
+
# Returns `true` or `false` to indicate whether the regexp is matched or not
|
961
|
+
# without updating $~ and other related variables. If the second parameter is
|
962
|
+
# present, it specifies the position in the string to begin the search.
|
963
|
+
#
|
964
|
+
# /R.../.match?("Ruby") #=> true
|
965
|
+
# /R.../.match?("Ruby", 1) #=> false
|
966
|
+
# /P.../.match?("Ruby") #=> false
|
967
|
+
# $& #=> nil
|
968
|
+
#
|
969
|
+
def match?: (String? | Symbol | _ToStr str, ?Integer pos) -> bool
|
970
|
+
|
971
|
+
# Returns a hash representing information about named captures of *rxp*.
|
972
|
+
#
|
973
|
+
# A key of the hash is a name of the named captures. A value of the hash is an
|
974
|
+
# array which is a list of indexes of the corresponding named captures.
|
975
|
+
#
|
976
|
+
# /(?<foo>.)(?<bar>.)/.named_captures
|
977
|
+
# #=> {"foo"=>[1], "bar"=>[2]}
|
978
|
+
#
|
979
|
+
# /(?<foo>.)(?<foo>.)/.named_captures
|
980
|
+
# #=> {"foo"=>[1, 2]}
|
981
|
+
#
|
982
|
+
# If there are no named captures, an empty hash is returned.
|
983
|
+
#
|
984
|
+
# /(.)(.)/.named_captures
|
985
|
+
# #=> {}
|
986
|
+
#
|
987
|
+
def named_captures: () -> ::Hash[String, ::Array[Integer]]
|
988
|
+
|
989
|
+
# Returns a list of names of captures as an array of strings.
|
990
|
+
#
|
991
|
+
# /(?<foo>.)(?<bar>.)(?<baz>.)/.names
|
992
|
+
# #=> ["foo", "bar", "baz"]
|
993
|
+
#
|
994
|
+
# /(?<foo>.)(?<foo>.)/.names
|
995
|
+
# #=> ["foo"]
|
996
|
+
#
|
997
|
+
# /(.)(.)/.names
|
998
|
+
# #=> []
|
999
|
+
#
|
1000
|
+
def names: () -> ::Array[String]
|
1001
|
+
|
1002
|
+
# Returns the set of bits corresponding to the options used when creating this
|
1003
|
+
# Regexp (see Regexp::new for details). Note that additional bits may be set in
|
1004
|
+
# the returned options: these are used internally by the regular expression
|
1005
|
+
# code. These extra bits are ignored if the options are passed to Regexp::new.
|
1006
|
+
#
|
1007
|
+
# Regexp::IGNORECASE #=> 1
|
1008
|
+
# Regexp::EXTENDED #=> 2
|
1009
|
+
# Regexp::MULTILINE #=> 4
|
1010
|
+
#
|
1011
|
+
# /cat/.options #=> 0
|
1012
|
+
# /cat/ix.options #=> 3
|
1013
|
+
# Regexp.new('cat', true).options #=> 1
|
1014
|
+
# /\xa1\xa2/e.options #=> 16
|
1015
|
+
#
|
1016
|
+
# r = /cat/ix
|
1017
|
+
# Regexp.new(r.source, r.options) #=> /cat/ix
|
1018
|
+
#
|
1019
|
+
def options: () -> Integer
|
1020
|
+
|
1021
|
+
# Returns the original string of the pattern.
|
1022
|
+
#
|
1023
|
+
# /ab+c/ix.source #=> "ab+c"
|
1024
|
+
#
|
1025
|
+
# Note that escape sequences are retained as is.
|
1026
|
+
#
|
1027
|
+
# /\x20\+/.source #=> "\\x20\\+"
|
1028
|
+
#
|
1029
|
+
def source: () -> String
|
1030
|
+
|
1031
|
+
# Returns a string containing the regular expression and its options (using the
|
1032
|
+
# `(?opts:source)` notation). This string can be fed back in to Regexp::new to create a
|
1033
|
+
# regular expression with the same semantics as the original. (However,
|
1034
|
+
# `Regexp#==` may not return true when comparing the two, as the source of the
|
1035
|
+
# regular expression itself may differ, as the example shows). Regexp#inspect
|
1036
|
+
# produces a generally more readable version of *rxp*.
|
1037
|
+
#
|
1038
|
+
# r1 = /ab+c/ix #=> /ab+c/ix
|
1039
|
+
# s1 = r1.to_s #=> "(?ix-m:ab+c)"
|
1040
|
+
# r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
|
1041
|
+
# r1 == r2 #=> false
|
1042
|
+
# r1.source #=> "ab+c"
|
1043
|
+
# r2.source #=> "(?ix-m:ab+c)"
|
1044
|
+
#
|
1045
|
+
def to_s: () -> String
|
1046
|
+
|
1047
|
+
# Match---Matches *rxp* against the contents of `$_`. Equivalent to
|
1048
|
+
# `rxp =~ $_`.
|
1049
|
+
#
|
1050
|
+
# $_ = "input data"
|
1051
|
+
# ~ /at/ #=> 7
|
1052
|
+
#
|
1053
|
+
def ~: () -> Integer?
|
1054
|
+
|
1055
|
+
private
|
1056
|
+
|
1057
|
+
def initialize_copy: (self object) -> self
|
1058
|
+
end
|
1059
|
+
|
1060
|
+
# See Regexp#options and Regexp.new.
|
1061
|
+
#
|
1062
|
+
#
|
1063
|
+
Regexp::EXTENDED: Integer
|
1064
|
+
|
1065
|
+
# See Regexp#options and Regexp.new.
|
1066
|
+
#
|
1067
|
+
#
|
1068
|
+
Regexp::FIXEDENCODING: Integer
|
1069
|
+
|
1070
|
+
# See Regexp#options and Regexp.new.
|
1071
|
+
#
|
1072
|
+
#
|
1073
|
+
Regexp::IGNORECASE: Integer
|
1074
|
+
|
1075
|
+
# See Regexp#options and Regexp.new.
|
1076
|
+
#
|
1077
|
+
#
|
1078
|
+
Regexp::MULTILINE: Integer
|
1079
|
+
|
1080
|
+
# See Regexp#options and Regexp.new.
|
1081
|
+
#
|
1082
|
+
#
|
1083
|
+
Regexp::NOENCODING: Integer
|