rust_regexp 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +44 -13
- data/ext/rust_regexp/src/lib.rs +31 -31
- data/lib/rust_regexp/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3fa462b537e9c799b5939981d347a240e31d6cf982cee50d3c63dff6f2fbc98b
|
4
|
+
data.tar.gz: bf19b0f1a052b8961645f216e4b2ce9ee4168f2447311752e1c7550f57f64286
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cf864254f850c217dfb4c74a861c59748de044306a43a702c9c4fc677fa5a2afef9e9231a24e47f6bbce6ee6ae382f370463fafa8782e290e15d24728d1c932
|
7
|
+
data.tar.gz: 3e8f831f749e1ff403d4b72b5e49ad07ae7603a878086cd192cbe396760cb3682a02715f43cdcce49a4ef2a545255225ab8db33af91c9de17e89f23e53327c82
|
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# RustRegexp
|
2
2
|
|
3
|
-
|
3
|
+
[Gem Version](https://badge.fury.io/rb/rust_regexp)
|
4
|
+
[Build Status](https://github.com/ocvit/rust_regexp/actions)
|
5
|
+
|
6
|
+
Ruby bindings for the [rust/regex](https://docs.rs/regex/latest/regex/) library.
|
4
7
|
|
5
8
|
## Installation
|
6
9
|
|
@@ -24,45 +27,51 @@ require "rust_regexp"
|
|
24
27
|
|
25
28
|
## Usage
|
26
29
|
|
27
|
-
Regular expressions should pre-compiled before use:
|
30
|
+
Regular expressions should be pre-compiled before use:
|
28
31
|
|
29
32
|
```ruby
|
30
|
-
re = RustRegexp.new('
|
33
|
+
re = RustRegexp.new('p.t{2}ern*')
|
31
34
|
# => #<RustRegexp:...>
|
32
35
|
```
|
33
36
|
|
34
37
|
> [!TIP]
|
35
38
|
> Note the use of *single quotes* when passing the regular expression as
|
36
|
-
> a string to `
|
39
|
+
> a string to `rust/regex` so that the backslashes aren't interpreted as escapes.
|
37
40
|
|
38
41
|
To find a single match in the haystack:
|
39
42
|
|
40
43
|
```ruby
|
41
|
-
|
44
|
+
RustRegexp.new('\w+:\d+').match("ruby:123, rust:456")
|
45
|
+
# => ["ruby:123"]
|
46
|
+
|
47
|
+
RustRegexp.new('(\w+):(\d+)').match("ruby:123, rust:456")
|
42
48
|
# => ["ruby", "123"]
|
43
49
|
```
|
44
50
|
|
45
51
|
To find all matches in the haystack:
|
46
52
|
|
47
53
|
```ruby
|
48
|
-
|
54
|
+
RustRegexp.new('\w+:\d+').scan("ruby:123, rust:456")
|
55
|
+
# => ["ruby:123", "rust:456"]
|
56
|
+
|
57
|
+
RustRegexp.new('(\w+):(\d+)').scan("ruby:123, rust:456")
|
49
58
|
# => [["ruby", "123"], ["rust", "456"]]
|
50
59
|
```
|
51
60
|
|
52
61
|
To check whether there is at least one match in the haystack:
|
53
62
|
|
54
63
|
```ruby
|
55
|
-
|
64
|
+
RustRegexp.new('\w+:\d+').match?("ruby:123")
|
56
65
|
# => true
|
57
66
|
|
58
|
-
|
67
|
+
RustRegexp.new('\w+:\d+').match?("ruby")
|
59
68
|
# => false
|
60
69
|
```
|
61
70
|
|
62
71
|
Inspect original pattern:
|
63
72
|
|
64
73
|
```ruby
|
65
|
-
|
74
|
+
RustRegexp.new('\w+:\d+').pattern
|
66
75
|
# => "\\w+:\\d+"
|
67
76
|
```
|
68
77
|
|
@@ -91,18 +100,40 @@ set.match("ghidefabc") # => [0, 1, 2]
|
|
91
100
|
To check whether at least one pattern from the set matches the haystack:
|
92
101
|
|
93
102
|
```ruby
|
94
|
-
|
103
|
+
RustRegexp::Set.new(["abc", "def"]).match?("abc")
|
95
104
|
# => true
|
96
105
|
|
97
|
-
|
106
|
+
RustRegexp::Set.new(["abc", "def"]).match?("123")
|
98
107
|
# => false
|
99
108
|
```
|
100
109
|
|
101
110
|
Inspect original patterns:
|
102
111
|
|
103
112
|
```ruby
|
104
|
-
|
105
|
-
# => ["abc", "def"
|
113
|
+
RustRegexp::Set.new(["abc", "def"]).patterns
|
114
|
+
# => ["abc", "def"]
|
115
|
+
```
|
116
|
+
|
117
|
+
## Encoding
|
118
|
+
|
119
|
+
Currently, `rust_regexp` expects the haystack to be a UTF-8 string.
|
120
|
+
|
121
|
+
It also supports parsing of strings with invalid UTF-8 characters by default. This is achieved by using `regex::bytes` instead of plain `regex` under the hood, so any byte sequence can be matched. The output match is encoded as a UTF-8 string.
|
122
|
+
|
123
|
+
If the Unicode awareness of the matchers should be disabled, both `RustRegexp` and `RustRegexp::Set` support the `unicode: false` option:
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
RustRegexp.new('\w+').match('ю٤夏')
|
127
|
+
# => ["ю٤夏"]
|
128
|
+
|
129
|
+
RustRegexp.new('\w+', unicode: false).match('ю٤夏')
|
130
|
+
# => []
|
131
|
+
|
132
|
+
RustRegexp::Set.new(['\w', '\d', '\s']).match("ю٤\u2000")
|
133
|
+
# => [0, 1, 2]
|
134
|
+
|
135
|
+
RustRegexp::Set.new(['\w', '\d', '\s'], unicode: false).match("ю٤\u2000")
|
136
|
+
# => []
|
106
137
|
```
|
107
138
|
|
108
139
|
## Development
|
data/ext/rust_regexp/src/lib.rs
CHANGED
@@ -1,28 +1,29 @@
|
|
1
1
|
use magnus::{
|
2
|
-
class,
|
3
|
-
define_class,
|
2
|
+
class, define_class,
|
4
3
|
encoding::RbEncoding,
|
5
|
-
exception,
|
6
|
-
|
7
|
-
|
8
|
-
prelude::*,
|
9
|
-
scan_args::scan_args,
|
10
|
-
Value,
|
11
|
-
Error,
|
12
|
-
RString,
|
13
|
-
RArray,
|
4
|
+
exception, function, method,
|
5
|
+
scan_args::{get_kwargs, scan_args},
|
6
|
+
Error, Module, Object, RArray, RHash, RString, Value,
|
14
7
|
};
|
15
|
-
use regex::bytes::{Regex, RegexSet,
|
8
|
+
use regex::bytes::{Match, Regex, RegexBuilder, RegexSet, RegexSetBuilder};
|
16
9
|
|
17
10
|
#[magnus::wrap(class = "RustRegexp", free_immediately, size)]
|
18
11
|
pub struct RustRegexp(Regex);
|
19
12
|
|
20
13
|
impl RustRegexp {
|
21
14
|
pub fn new(args: &[Value]) -> Result<Self, Error> {
|
22
|
-
let args = scan_args::<(String,), (), (), (),
|
15
|
+
let args = scan_args::<(String,), (), (), (), RHash, ()>(args)?;
|
16
|
+
let kwargs = get_kwargs::<_, (), (Option<bool>,), ()>(args.keywords, &[], &["unicode"])?;
|
17
|
+
|
23
18
|
let pattern = args.required.0;
|
19
|
+
let (unicode,) = kwargs.optional;
|
20
|
+
let unicode = unicode.unwrap_or_else(|| true);
|
24
21
|
|
25
|
-
let
|
22
|
+
let mut builder = RegexBuilder::new(&pattern);
|
23
|
+
let regex = builder
|
24
|
+
.unicode(unicode)
|
25
|
+
.build()
|
26
|
+
.map_err(|e| Error::new(exception::arg_error(), e.to_string()))?;
|
26
27
|
|
27
28
|
Ok(Self(regex))
|
28
29
|
}
|
@@ -33,6 +34,7 @@ impl RustRegexp {
|
|
33
34
|
let regex = &self.0;
|
34
35
|
let haystack = unsafe { haystack.as_slice() };
|
35
36
|
|
37
|
+
// no capture groups defined except the default one
|
36
38
|
if regex.captures_len() == 1 {
|
37
39
|
// speed optimization, `.find` is faster than `.captures`
|
38
40
|
if let Some(capture) = regex.find(haystack) {
|
@@ -65,17 +67,12 @@ impl RustRegexp {
|
|
65
67
|
let regex = &self.0;
|
66
68
|
let haystack = unsafe { haystack.as_slice() };
|
67
69
|
|
70
|
+
// no capture groups defined except the default one
|
68
71
|
if regex.captures_len() == 1 {
|
69
72
|
// speed optimization, `.find_iter` is faster than `.captures_iter`
|
70
73
|
for capture in regex.find_iter(haystack) {
|
71
|
-
let group = RArray::with_capacity(1);
|
72
|
-
|
73
|
-
group
|
74
|
-
.push(Self::capture_to_ruby_string(&capture))
|
75
|
-
.expect("Non-frozen array");
|
76
|
-
|
77
74
|
result
|
78
|
-
.push(
|
75
|
+
.push(Self::capture_to_ruby_string(&capture))
|
79
76
|
.expect("Non-frozen array");
|
80
77
|
}
|
81
78
|
} else {
|
@@ -94,9 +91,7 @@ impl RustRegexp {
|
|
94
91
|
}
|
95
92
|
}
|
96
93
|
|
97
|
-
result
|
98
|
-
.push(group)
|
99
|
-
.expect("Non-frozen array");
|
94
|
+
result.push(group).expect("Non-frozen array");
|
100
95
|
}
|
101
96
|
}
|
102
97
|
|
@@ -117,10 +112,7 @@ impl RustRegexp {
|
|
117
112
|
}
|
118
113
|
|
119
114
|
fn capture_to_ruby_string(capture: &Match) -> RString {
|
120
|
-
RString::enc_new(
|
121
|
-
capture.as_bytes(),
|
122
|
-
RbEncoding::utf8()
|
123
|
-
)
|
115
|
+
RString::enc_new(capture.as_bytes(), RbEncoding::utf8())
|
124
116
|
}
|
125
117
|
}
|
126
118
|
|
@@ -129,10 +121,18 @@ pub struct RustRegexpSet(RegexSet);
|
|
129
121
|
|
130
122
|
impl RustRegexpSet {
|
131
123
|
pub fn new(args: &[Value]) -> Result<Self, Error> {
|
132
|
-
let args = scan_args::<(Vec<String>,), (), (), (),
|
124
|
+
let args = scan_args::<(Vec<String>,), (), (), (), RHash, ()>(args)?;
|
125
|
+
let kwargs = get_kwargs::<_, (), (Option<bool>,), ()>(args.keywords, &[], &["unicode"])?;
|
126
|
+
|
133
127
|
let patterns = args.required.0;
|
128
|
+
let (unicode,) = kwargs.optional;
|
129
|
+
let unicode = unicode.unwrap_or_else(|| true);
|
134
130
|
|
135
|
-
let
|
131
|
+
let mut builder = RegexSetBuilder::new(patterns);
|
132
|
+
let set = builder
|
133
|
+
.unicode(unicode)
|
134
|
+
.build()
|
135
|
+
.map_err(|e| Error::new(exception::arg_error(), e.to_string()))?;
|
136
136
|
|
137
137
|
Ok(Self(set))
|
138
138
|
}
|
@@ -141,7 +141,7 @@ impl RustRegexpSet {
|
|
141
141
|
let set = &self.0;
|
142
142
|
let haystack = unsafe { haystack.as_slice() };
|
143
143
|
|
144
|
-
set.matches(haystack).
|
144
|
+
set.matches(haystack).iter().collect()
|
145
145
|
}
|
146
146
|
|
147
147
|
pub fn is_match(&self, haystack: RString) -> bool {
|
data/lib/rust_regexp/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rust_regexp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmytro Horoshko
|
@@ -9,7 +9,7 @@ bindir: exe
|
|
9
9
|
cert_chain: []
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies: []
|
12
|
-
description: Simple bindings
|
12
|
+
description: Simple bindings for rust/regex library.
|
13
13
|
email:
|
14
14
|
- electric.molfar@gmail.com
|
15
15
|
executables: []
|