rust_regexp 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -0
- data/ext/rust_regexp/src/lib.rs +28 -24
- data/lib/rust_regexp/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3fa462b537e9c799b5939981d347a240e31d6cf982cee50d3c63dff6f2fbc98b
|
4
|
+
data.tar.gz: bf19b0f1a052b8961645f216e4b2ce9ee4168f2447311752e1c7550f57f64286
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cf864254f850c217dfb4c74a861c59748de044306a43a702c9c4fc677fa5a2afef9e9231a24e47f6bbce6ee6ae382f370463fafa8782e290e15d24728d1c932
|
7
|
+
data.tar.gz: 3e8f831f749e1ff403d4b72b5e49ad07ae7603a878086cd192cbe396760cb3682a02715f43cdcce49a4ef2a545255225ab8db33af91c9de17e89f23e53327c82
|
data/README.md
CHANGED
@@ -114,6 +114,28 @@ RustRegexp::Set.new(["abc", "def"]).patterns
|
|
114
114
|
# => ["abc", "def"]
|
115
115
|
```
|
116
116
|
|
117
|
+
## Encoding
|
118
|
+
|
119
|
+
Currently, `rust_regexp` expects the haystack to be a UTF-8 string.
|
120
|
+
|
121
|
+
It also supports parsing of strings with invalid UTF-8 characters by default. This is achieved by using `regex::bytes` instead of plain `regex` under the hood, so any byte sequence can be matched. The output match is encoded as a UTF-8 string.
|
122
|
+
|
123
|
+
If Unicode awareness of the matchers should be disabled, both `RustRegexp` and `RustRegexp::Set` support the `unicode: false` option:
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
RustRegexp.new('\w+').match('ю٤夏')
|
127
|
+
# => ["ю٤夏"]
|
128
|
+
|
129
|
+
RustRegexp.new('\w+', unicode: false).match('ю٤夏')
|
130
|
+
# => []
|
131
|
+
|
132
|
+
RustRegexp::Set.new(['\w', '\d', '\s']).match("ю٤\u2000")
|
133
|
+
# => [0, 1, 2]
|
134
|
+
|
135
|
+
RustRegexp::Set.new(['\w', '\d', '\s'], unicode: false).match("ю٤\u2000")
|
136
|
+
# => []
|
137
|
+
```
|
138
|
+
|
117
139
|
## Development
|
118
140
|
|
119
141
|
```sh
|
data/ext/rust_regexp/src/lib.rs
CHANGED
@@ -1,28 +1,29 @@
|
|
1
1
|
use magnus::{
|
2
|
-
class,
|
3
|
-
define_class,
|
2
|
+
class, define_class,
|
4
3
|
encoding::RbEncoding,
|
5
|
-
exception,
|
6
|
-
|
7
|
-
|
8
|
-
prelude::*,
|
9
|
-
scan_args::scan_args,
|
10
|
-
Value,
|
11
|
-
Error,
|
12
|
-
RString,
|
13
|
-
RArray,
|
4
|
+
exception, function, method,
|
5
|
+
scan_args::{get_kwargs, scan_args},
|
6
|
+
Error, Module, Object, RArray, RHash, RString, Value,
|
14
7
|
};
|
15
|
-
use regex::bytes::{Regex, RegexSet,
|
8
|
+
use regex::bytes::{Match, Regex, RegexBuilder, RegexSet, RegexSetBuilder};
|
16
9
|
|
17
10
|
#[magnus::wrap(class = "RustRegexp", free_immediately, size)]
|
18
11
|
pub struct RustRegexp(Regex);
|
19
12
|
|
20
13
|
impl RustRegexp {
|
21
14
|
pub fn new(args: &[Value]) -> Result<Self, Error> {
|
22
|
-
let args = scan_args::<(String,), (), (), (),
|
15
|
+
let args = scan_args::<(String,), (), (), (), RHash, ()>(args)?;
|
16
|
+
let kwargs = get_kwargs::<_, (), (Option<bool>,), ()>(args.keywords, &[], &["unicode"])?;
|
17
|
+
|
23
18
|
let pattern = args.required.0;
|
19
|
+
let (unicode,) = kwargs.optional;
|
20
|
+
let unicode = unicode.unwrap_or_else(|| true);
|
24
21
|
|
25
|
-
let
|
22
|
+
let mut builder = RegexBuilder::new(&pattern);
|
23
|
+
let regex = builder
|
24
|
+
.unicode(unicode)
|
25
|
+
.build()
|
26
|
+
.map_err(|e| Error::new(exception::arg_error(), e.to_string()))?;
|
26
27
|
|
27
28
|
Ok(Self(regex))
|
28
29
|
}
|
@@ -90,9 +91,7 @@ impl RustRegexp {
|
|
90
91
|
}
|
91
92
|
}
|
92
93
|
|
93
|
-
result
|
94
|
-
.push(group)
|
95
|
-
.expect("Non-frozen array");
|
94
|
+
result.push(group).expect("Non-frozen array");
|
96
95
|
}
|
97
96
|
}
|
98
97
|
|
@@ -113,10 +112,7 @@ impl RustRegexp {
|
|
113
112
|
}
|
114
113
|
|
115
114
|
fn capture_to_ruby_string(capture: &Match) -> RString {
|
116
|
-
RString::enc_new(
|
117
|
-
capture.as_bytes(),
|
118
|
-
RbEncoding::utf8()
|
119
|
-
)
|
115
|
+
RString::enc_new(capture.as_bytes(), RbEncoding::utf8())
|
120
116
|
}
|
121
117
|
}
|
122
118
|
|
@@ -125,10 +121,18 @@ pub struct RustRegexpSet(RegexSet);
|
|
125
121
|
|
126
122
|
impl RustRegexpSet {
|
127
123
|
pub fn new(args: &[Value]) -> Result<Self, Error> {
|
128
|
-
let args = scan_args::<(Vec<String>,), (), (), (),
|
124
|
+
let args = scan_args::<(Vec<String>,), (), (), (), RHash, ()>(args)?;
|
125
|
+
let kwargs = get_kwargs::<_, (), (Option<bool>,), ()>(args.keywords, &[], &["unicode"])?;
|
126
|
+
|
129
127
|
let patterns = args.required.0;
|
128
|
+
let (unicode,) = kwargs.optional;
|
129
|
+
let unicode = unicode.unwrap_or_else(|| true);
|
130
130
|
|
131
|
-
let
|
131
|
+
let mut builder = RegexSetBuilder::new(patterns);
|
132
|
+
let set = builder
|
133
|
+
.unicode(unicode)
|
134
|
+
.build()
|
135
|
+
.map_err(|e| Error::new(exception::arg_error(), e.to_string()))?;
|
132
136
|
|
133
137
|
Ok(Self(set))
|
134
138
|
}
|
@@ -137,7 +141,7 @@ impl RustRegexpSet {
|
|
137
141
|
let set = &self.0;
|
138
142
|
let haystack = unsafe { haystack.as_slice() };
|
139
143
|
|
140
|
-
set.matches(haystack).
|
144
|
+
set.matches(haystack).iter().collect()
|
141
145
|
}
|
142
146
|
|
143
147
|
pub fn is_match(&self, haystack: RString) -> bool {
|
data/lib/rust_regexp/version.rb
CHANGED