regexgen 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +23 -0
- data/.gitignore +9 -0
- data/.rubocop.yml +84 -0
- data/.rubocop_todo.yml +43 -0
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +44 -0
- data/LICENSE.txt +21 -0
- data/README.md +85 -0
- data/Rakefile +12 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/regexgen.rb +14 -0
- data/lib/regexgen/ast.rb +199 -0
- data/lib/regexgen/minimize.rb +95 -0
- data/lib/regexgen/regex.rb +136 -0
- data/lib/regexgen/set.rb +19 -0
- data/lib/regexgen/state.rb +34 -0
- data/lib/regexgen/trie.rb +45 -0
- data/lib/regexgen/version.rb +5 -0
- data/regexgen.gemspec +30 -0
- metadata +95 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 46f71792a530cbee9556f1d401be2095c27b89d177c8cfbb0a5a2bacbdbb4c43
|
4
|
+
data.tar.gz: 749f8d8e47d1539239e47d7c19ad688fd7056d75278a94a7dd555248a3649f9b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 03f4f12753dee716dadde7e6b3e244356327506ec598a5ffec93427cfb42dd20ff2732005a8907915d7155300fbe4511aaf857b19eb123d35a8463598e29d3e8
|
7
|
+
data.tar.gz: 89490be6ae578fde9e753221afdaa1d83767af9e77f638b6d788d3b632f22d81ac608740d7e22c15edd9145d706aee3f333343e013e1bc5b9cf4ab529e568f5e
|
@@ -0,0 +1,23 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ master ]
|
6
|
+
pull_request:
|
7
|
+
branches: [ master ]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
build:
|
11
|
+
|
12
|
+
runs-on: ubuntu-latest
|
13
|
+
|
14
|
+
steps:
|
15
|
+
- uses: actions/checkout@v2
|
16
|
+
- name: Setup Ruby, JRuby and TruffleRuby
|
17
|
+
uses: ruby/setup-ruby@v1.40.0
|
18
|
+
- name: Install dependencies
|
19
|
+
run: bundle install
|
20
|
+
- name: Run tests
|
21
|
+
run: bundle exec rake test
|
22
|
+
- name: Lint
|
23
|
+
run: bundle exec rubocop
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
2
|
+
|
3
|
+
Style/AsciiComments:
|
4
|
+
Enabled: false
|
5
|
+
|
6
|
+
Naming/MethodParameterName:
|
7
|
+
Exclude:
|
8
|
+
- 'lib/regexgen/regex.rb'
|
9
|
+
- 'lib/regexgen/ast.rb'
|
10
|
+
|
11
|
+
Layout/EmptyLinesAroundAttributeAccessor:
|
12
|
+
Enabled: true
|
13
|
+
Layout/SpaceAroundMethodCallOperator:
|
14
|
+
Enabled: true
|
15
|
+
Lint/BinaryOperatorWithIdenticalOperands:
|
16
|
+
Enabled: true
|
17
|
+
Lint/DeprecatedOpenSSLConstant:
|
18
|
+
Enabled: true
|
19
|
+
Lint/DuplicateElsifCondition:
|
20
|
+
Enabled: true
|
21
|
+
Lint/DuplicateRescueException:
|
22
|
+
Enabled: true
|
23
|
+
Lint/EmptyConditionalBody:
|
24
|
+
Enabled: true
|
25
|
+
Lint/FloatComparison:
|
26
|
+
Enabled: true
|
27
|
+
Lint/MissingSuper:
|
28
|
+
Enabled: true
|
29
|
+
Lint/MixedRegexpCaptureTypes:
|
30
|
+
Enabled: true
|
31
|
+
Lint/OutOfRangeRegexpRef:
|
32
|
+
Enabled: true
|
33
|
+
Lint/RaiseException:
|
34
|
+
Enabled: true
|
35
|
+
Lint/SelfAssignment:
|
36
|
+
Enabled: true
|
37
|
+
Lint/StructNewOverride:
|
38
|
+
Enabled: true
|
39
|
+
Lint/TopLevelReturnWithArgument:
|
40
|
+
Enabled: true
|
41
|
+
Lint/UnreachableLoop:
|
42
|
+
Enabled: true
|
43
|
+
Style/AccessorGrouping:
|
44
|
+
Enabled: true
|
45
|
+
Style/ArrayCoercion:
|
46
|
+
Enabled: true
|
47
|
+
Style/BisectedAttrAccessor:
|
48
|
+
Enabled: true
|
49
|
+
Style/CaseLikeIf:
|
50
|
+
Enabled: true
|
51
|
+
Style/ExplicitBlockArgument:
|
52
|
+
Enabled: true
|
53
|
+
Style/ExponentialNotation:
|
54
|
+
Enabled: true
|
55
|
+
Style/GlobalStdStream:
|
56
|
+
Enabled: true
|
57
|
+
Style/HashAsLastArrayItem:
|
58
|
+
Enabled: true
|
59
|
+
Style/HashEachMethods:
|
60
|
+
Enabled: true
|
61
|
+
Style/HashLikeCase:
|
62
|
+
Enabled: true
|
63
|
+
Style/HashTransformKeys:
|
64
|
+
Enabled: true
|
65
|
+
Style/HashTransformValues:
|
66
|
+
Enabled: true
|
67
|
+
Style/OptionalBooleanParameter:
|
68
|
+
Enabled: true
|
69
|
+
Style/RedundantAssignment:
|
70
|
+
Enabled: true
|
71
|
+
Style/RedundantFetchBlock:
|
72
|
+
Enabled: true
|
73
|
+
Style/RedundantFileExtensionInRequire:
|
74
|
+
Enabled: true
|
75
|
+
Style/RedundantRegexpCharacterClass:
|
76
|
+
Enabled: true
|
77
|
+
Style/RedundantRegexpEscape:
|
78
|
+
Enabled: true
|
79
|
+
Style/SingleArgumentDig:
|
80
|
+
Enabled: true
|
81
|
+
Style/SlicingWithRange:
|
82
|
+
Enabled: true
|
83
|
+
Style/StringConcatenation:
|
84
|
+
Enabled: true
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2020-08-10 13:40:30 UTC using RuboCop version 0.89.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 5
|
10
|
+
# Configuration parameters: IgnoredMethods.
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Max: 66
|
13
|
+
|
14
|
+
# Offense count: 7
|
15
|
+
# Configuration parameters: IgnoredMethods.
|
16
|
+
Metrics/CyclomaticComplexity:
|
17
|
+
Max: 22
|
18
|
+
|
19
|
+
# Offense count: 6
|
20
|
+
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods.
|
21
|
+
Metrics/MethodLength:
|
22
|
+
Max: 39
|
23
|
+
|
24
|
+
# Offense count: 1
|
25
|
+
# Configuration parameters: CountComments, CountAsOne.
|
26
|
+
Metrics/ModuleLength:
|
27
|
+
Max: 103
|
28
|
+
|
29
|
+
# Offense count: 6
|
30
|
+
# Configuration parameters: IgnoredMethods.
|
31
|
+
Metrics/PerceivedComplexity:
|
32
|
+
Max: 24
|
33
|
+
|
34
|
+
# Offense count: 5
|
35
|
+
Style/Documentation:
|
36
|
+
Exclude:
|
37
|
+
- 'spec/**/*'
|
38
|
+
- 'test/**/*'
|
39
|
+
- 'lib/regexgen/minimize.rb'
|
40
|
+
- 'lib/regexgen/regex.rb'
|
41
|
+
- 'lib/regexgen/set.rb'
|
42
|
+
- 'lib/regexgen/state.rb'
|
43
|
+
- 'lib/regexgen/trie.rb'
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.6.5
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
regexgen (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ast (2.4.1)
|
10
|
+
byebug (11.1.3)
|
11
|
+
minitest (5.14.1)
|
12
|
+
parallel (1.19.2)
|
13
|
+
parser (2.7.1.4)
|
14
|
+
ast (~> 2.4.1)
|
15
|
+
rainbow (3.0.0)
|
16
|
+
rake (12.3.3)
|
17
|
+
regexp_parser (1.7.1)
|
18
|
+
rexml (3.2.4)
|
19
|
+
rubocop (0.89.0)
|
20
|
+
parallel (~> 1.10)
|
21
|
+
parser (>= 2.7.1.1)
|
22
|
+
rainbow (>= 2.2.2, < 4.0)
|
23
|
+
regexp_parser (>= 1.7)
|
24
|
+
rexml
|
25
|
+
rubocop-ast (>= 0.1.0, < 1.0)
|
26
|
+
ruby-progressbar (~> 1.7)
|
27
|
+
unicode-display_width (>= 1.4.0, < 2.0)
|
28
|
+
rubocop-ast (0.3.0)
|
29
|
+
parser (>= 2.7.1.4)
|
30
|
+
ruby-progressbar (1.10.1)
|
31
|
+
unicode-display_width (1.7.0)
|
32
|
+
|
33
|
+
PLATFORMS
|
34
|
+
ruby
|
35
|
+
|
36
|
+
DEPENDENCIES
|
37
|
+
byebug (~> 11)
|
38
|
+
minitest (~> 5.0)
|
39
|
+
rake (~> 12.0)
|
40
|
+
regexgen!
|
41
|
+
rubocop (~> 0.89)
|
42
|
+
|
43
|
+
BUNDLED WITH
|
44
|
+
2.1.4
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2020 Aaron Madlon-Kay
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# regexgen
|
2
|
+
|
3
|
+
Generate regular expressions that match a set of strings.
|
4
|
+
|
5
|
+
This is a Ruby port of [@devongovett](https://github.com/devongovett/regexgen)'s
|
6
|
+
JavaScript [regexgen](https://github.com/devongovett/regexgen) package.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
gem 'regexgen'
|
14
|
+
```
|
15
|
+
|
16
|
+
And then execute:
|
17
|
+
|
18
|
+
$ bundle install
|
19
|
+
|
20
|
+
Or install it yourself as:
|
21
|
+
|
22
|
+
$ gem install regexgen
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'regexgen'
|
28
|
+
|
29
|
+
Regexgen.generate(['foobar', 'foobaz', 'foozap', 'fooza']) #=> /foo(?:zap?|ba[rz])/
|
30
|
+
```
|
31
|
+
|
32
|
+
## Unicode handling
|
33
|
+
|
34
|
+
Unlike the JavaScript version, this package does not do any special Unicode
|
35
|
+
handling because Ruby does it all for you. It is recommended that you use a Unicode
|
36
|
+
encoding for your strings.
|
37
|
+
|
38
|
+
## How does it work?
|
39
|
+
|
40
|
+
Just like the JavaScript version:
|
41
|
+
|
42
|
+
1. Generate a [Trie](https://en.wikipedia.org/wiki/Trie) containing all of the
|
43
|
+
input strings. This is a tree structure where each edge represents a single
|
44
|
+
character. This removes redundancies at the start of the strings, but common
|
45
|
+
branches further down are not merged.
|
46
|
+
|
47
|
+
2. A trie can be seen as a tree-shaped deterministic finite automaton (DFA), so
|
48
|
+
DFA algorithms can be applied. In this case, we apply [Hopcroft's DFA
|
49
|
+
minimization
|
50
|
+
algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm)
|
51
|
+
to merge the nondistinguishable states.
|
52
|
+
|
53
|
+
3. Convert the resulting minimized DFA to a regular expression. This is done
|
54
|
+
using [Brzozowski's algebraic
|
55
|
+
method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392),
|
56
|
+
which is quite elegant. It expresses the DFA as a system of equations which
|
57
|
+
can be solved for a resulting regex. Along the way, some additional
|
58
|
+
optimizations are made, such as hoisting common substrings out of an
|
59
|
+
alternation, and using character class ranges. This produces an [Abstract
|
60
|
+
Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST) for
|
61
|
+
the regex, which is then converted to a string and compiled to a Ruby
|
62
|
+
`Regexp` object.
|
63
|
+
|
64
|
+
## Development
|
65
|
+
|
66
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
67
|
+
`rake test` to run the tests. You can also run `bin/console` for an interactive
|
68
|
+
prompt that will allow you to experiment.
|
69
|
+
|
70
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To
|
71
|
+
release a new version, update the version number in `version.rb`, and then run
|
72
|
+
`bundle exec rake release`, which will create a git tag for the version, push
|
73
|
+
git commits and tags, and push the `.gem` file to
|
74
|
+
[rubygems.org](https://rubygems.org).
|
75
|
+
|
76
|
+
## Contributing
|
77
|
+
|
78
|
+
Bug reports and pull requests are welcome on GitHub at
|
79
|
+
https://github.com/amake/regexgen.
|
80
|
+
|
81
|
+
|
82
|
+
## License
|
83
|
+
|
84
|
+
The gem is available as open source under the terms of the [MIT
|
85
|
+
License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'regexgen'
|
6
|
+
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
9
|
+
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
+
# require "pry"
|
12
|
+
# Pry.start
|
13
|
+
|
14
|
+
require 'irb'
|
15
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/regexgen.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'regexgen/version'
|
4
|
+
require 'regexgen/trie'
|
5
|
+
|
6
|
+
# Generate regular expressions that match a set of strings
|
7
|
+
module Regexgen
|
8
|
+
class <<self
|
9
|
+
def generate(strings, flags = nil)
|
10
|
+
Trie.new.tap { |t| strings.each(&t.method(:add)) }
|
11
|
+
.to_regex(flags)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/lib/regexgen/ast.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Regexgen
|
4
|
+
# Classes in the abstract syntax tree representation
|
5
|
+
module Ast
|
6
|
+
# Represents an alternation (e.g. `foo|bar`)
|
7
|
+
class Alternation
|
8
|
+
attr_reader :precedence, :options
|
9
|
+
|
10
|
+
def initialize(*options)
|
11
|
+
@precedence = 1
|
12
|
+
@options = flatten(options).sort { |a, b| b.length - a.length }
|
13
|
+
end
|
14
|
+
|
15
|
+
def length
|
16
|
+
@options[0].length
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_s
|
20
|
+
@options.map { |o| Ast.parens(o, self) }.join('|')
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def flatten(options)
|
26
|
+
options.map { |option| option.is_a?(Alternation) ? flatten(option.options) : option }
|
27
|
+
.flatten
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Represents a character class (e.g. [0-9a-z])
|
32
|
+
class CharClass
|
33
|
+
attr_reader :precedence
|
34
|
+
|
35
|
+
def initialize(a, b)
|
36
|
+
@precedence = 1
|
37
|
+
@set = [a, b].flatten
|
38
|
+
end
|
39
|
+
|
40
|
+
def length
|
41
|
+
1
|
42
|
+
end
|
43
|
+
|
44
|
+
def single_character?
|
45
|
+
@set.none? { |c| c.ord > 0xffff }
|
46
|
+
end
|
47
|
+
|
48
|
+
def single_codepoint?
|
49
|
+
true
|
50
|
+
end
|
51
|
+
|
52
|
+
def to_s
|
53
|
+
"[#{to_ranges_string}]"
|
54
|
+
end
|
55
|
+
|
56
|
+
def char_class
|
57
|
+
@set
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def to_ranges_string
|
63
|
+
to_ranges.map do |first, last|
|
64
|
+
if first == last
|
65
|
+
first
|
66
|
+
elsif first.ord.next == last.ord
|
67
|
+
"#{first}#{last}"
|
68
|
+
else
|
69
|
+
"#{first}-#{last}"
|
70
|
+
end
|
71
|
+
end.join
|
72
|
+
end
|
73
|
+
|
74
|
+
def to_ranges
|
75
|
+
set = @set.sort
|
76
|
+
ranges = [[set.first, set.first]]
|
77
|
+
set.drop(1).each_with_object(ranges) do |c, acc|
|
78
|
+
if acc.last.last.ord.next == c.ord
|
79
|
+
acc.last[-1] = c
|
80
|
+
else
|
81
|
+
acc << [c, c]
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# Represents a concatenation (e.g. `foo`)
|
88
|
+
class Concatenation
|
89
|
+
attr_reader :precedence, :a, :b
|
90
|
+
|
91
|
+
def initialize(a, b)
|
92
|
+
@precedence = 2
|
93
|
+
@a = a
|
94
|
+
@b = b
|
95
|
+
end
|
96
|
+
|
97
|
+
def length
|
98
|
+
@a.length + @b.length
|
99
|
+
end
|
100
|
+
|
101
|
+
def to_s
|
102
|
+
Ast.parens(@a, self) + Ast.parens(@b, self)
|
103
|
+
end
|
104
|
+
|
105
|
+
def literal(side)
|
106
|
+
return @a.literal(side) if side == :start && @a.respond_to?(:literal)
|
107
|
+
|
108
|
+
@b.literal(side) if side == :end && @b.respond_to?(:literal)
|
109
|
+
end
|
110
|
+
|
111
|
+
def remove_substring(side, len)
|
112
|
+
a = @a
|
113
|
+
b = @b
|
114
|
+
a = @a.remove_substring(side, len) if side == :start && @a.respond_to?(:remove_substring)
|
115
|
+
b = @b.remove_substring(side, len) if side == :end && @b.respond_to?(:remove_substring)
|
116
|
+
|
117
|
+
return b if a.respond_to?(:empty?) && a.empty?
|
118
|
+
return a if b.respond_to?(:empty?) && b.empty?
|
119
|
+
|
120
|
+
Concatenation.new(a, b)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# Represents a repetition (e.g. `a*` or `a?`)
|
125
|
+
class Repetition
|
126
|
+
attr_reader :precedence, :expr, :type
|
127
|
+
|
128
|
+
def initialize(expr, type)
|
129
|
+
@precedence = 3
|
130
|
+
@expr = expr
|
131
|
+
@type = type
|
132
|
+
end
|
133
|
+
|
134
|
+
def length
|
135
|
+
@expr.length
|
136
|
+
end
|
137
|
+
|
138
|
+
def to_s
|
139
|
+
Ast.parens(@expr, self) + @type
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
# Represents a literal (e.g. a string)
|
144
|
+
class Literal
|
145
|
+
attr_reader :precedence, :value
|
146
|
+
|
147
|
+
def initialize(value)
|
148
|
+
@precedence = 2
|
149
|
+
@value = value
|
150
|
+
end
|
151
|
+
|
152
|
+
def empty?
|
153
|
+
@value.empty?
|
154
|
+
end
|
155
|
+
|
156
|
+
def single_character?
|
157
|
+
length == 1
|
158
|
+
end
|
159
|
+
|
160
|
+
def single_codepoint?
|
161
|
+
@value.codepoints.length == 1
|
162
|
+
end
|
163
|
+
|
164
|
+
def length
|
165
|
+
@value.length
|
166
|
+
end
|
167
|
+
|
168
|
+
def to_s
|
169
|
+
Regexp.escape(@value)
|
170
|
+
end
|
171
|
+
|
172
|
+
def char_class
|
173
|
+
@value if single_codepoint?
|
174
|
+
end
|
175
|
+
|
176
|
+
def literal(_side = nil)
|
177
|
+
@value
|
178
|
+
end
|
179
|
+
|
180
|
+
def remove_substring(side, len)
|
181
|
+
return Literal.new(@value[len..]) if side == :start
|
182
|
+
return Literal.new(@value[0...(@value.length - len)]) if side == :end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
class<<self
|
187
|
+
def parens(exp, parent)
|
188
|
+
str = exp.to_s
|
189
|
+
if exp.precedence < parent.precedence
|
190
|
+
unless exp.respond_to?(:single_character?) && exp.single_character?
|
191
|
+
return "(?:#{str})" unless exp.respond_to?(:single_codepoint?) && exp.single_codepoint?
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
str
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'regexgen/set'
|
4
|
+
|
5
|
+
module Regexgen
|
6
|
+
using SetUtil
|
7
|
+
|
8
|
+
class<<self
|
9
|
+
# Hopcroft's DFA minimization algorithm
|
10
|
+
# https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
|
11
|
+
#
|
12
|
+
# Largely ported from
|
13
|
+
# https://github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js
|
14
|
+
#
|
15
|
+
# P := {F, Q \ F};
|
16
|
+
# W := {F, Q \ F};
|
17
|
+
# while (W is not empty) do
|
18
|
+
# choose and remove a set A from W
|
19
|
+
# for each c in Σ do
|
20
|
+
# let X be the set of states for which a transition on c leads to a state in A
|
21
|
+
# for each set Y in P for which X ∩ Y is nonempty and Y \ X is nonempty do
|
22
|
+
# replace Y in P by the two sets X ∩ Y and Y \ X
|
23
|
+
# if Y is in W
|
24
|
+
# replace Y in W by the same two sets
|
25
|
+
# else
|
26
|
+
# if |X ∩ Y| <= |Y \ X|
|
27
|
+
# add X ∩ Y to W
|
28
|
+
# else
|
29
|
+
# add Y \ X to W
|
30
|
+
# end;
|
31
|
+
# end;
|
32
|
+
# end;
|
33
|
+
#
|
34
|
+
# Key:
|
35
|
+
#
|
36
|
+
# {...} is a Set (yes we have sets of sets here)
|
37
|
+
# A \ B is the set difference, i.e. relative complement (A - B)
|
38
|
+
# Q is all states
|
39
|
+
# F is final states
|
40
|
+
# Σ is the DFA's alphabet
|
41
|
+
# c is a letter of the alphabet
|
42
|
+
# A ∩ B is intersection (A & B)
|
43
|
+
# |A| is the cardinality of A (A.size)
|
44
|
+
def minimize(root, alphabet)
|
45
|
+
states = root.visit
|
46
|
+
final_states = states.select(&:accepting).to_set
|
47
|
+
|
48
|
+
p = Set[states, final_states]
|
49
|
+
w = Set.new(p)
|
50
|
+
until w.empty?
|
51
|
+
a = w.shift
|
52
|
+
alphabet.each do |c|
|
53
|
+
x = states.each_with_object(Set.new) do |s, acc|
|
54
|
+
next unless s.transitions.key?(c)
|
55
|
+
|
56
|
+
acc.add(s) if a.include?(s.transitions[c])
|
57
|
+
end
|
58
|
+
p.to_a.each do |y|
|
59
|
+
intersection = x & y
|
60
|
+
next if intersection.empty?
|
61
|
+
|
62
|
+
complement = y - x
|
63
|
+
next if complement.empty?
|
64
|
+
|
65
|
+
p.replace(y, intersection, complement)
|
66
|
+
if w.include?(y)
|
67
|
+
w.replace(y, intersection, complement)
|
68
|
+
elsif intersection.size <= complement.size
|
69
|
+
w.add(intersection)
|
70
|
+
else
|
71
|
+
w.add(complement)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
new_states = Hash.new { |hash, key| hash[key] = State.new }
|
78
|
+
initial = nil
|
79
|
+
|
80
|
+
p.each do |s|
|
81
|
+
first = s.first
|
82
|
+
s_ = new_states[s]
|
83
|
+
first.transitions.each do |c, old|
|
84
|
+
s_.transitions[c] = new_states[p.find { |v| v.include?(old) }]
|
85
|
+
end
|
86
|
+
|
87
|
+
s_.accepting = first.accepting
|
88
|
+
|
89
|
+
initial = s_ if s.include?(root)
|
90
|
+
end
|
91
|
+
|
92
|
+
initial
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'regexgen/ast'
|
4
|
+
|
5
|
+
module Regexgen
|
6
|
+
class<<self
|
7
|
+
def to_regex(root)
|
8
|
+
states = root.visit.to_a
|
9
|
+
|
10
|
+
a = []
|
11
|
+
b = []
|
12
|
+
|
13
|
+
states.each_with_index do |a_, i|
|
14
|
+
b[i] = Ast::Literal.new('') if a_.accepting
|
15
|
+
|
16
|
+
a[i] = []
|
17
|
+
a_.transitions.each do |t, s|
|
18
|
+
j = states.index(s)
|
19
|
+
a[i][j] = a[i][j] ? union(a[i][j], Ast::Literal.new(t)) : Ast::Literal.new(t)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
(states.length - 1).downto(0) do |n|
|
24
|
+
if a[n][n]
|
25
|
+
b[n] = concat(star(a[n][n]), b[n]) # star wraps the self-loop; concat takes two expressions
|
26
|
+
(0...n).each do |j|
|
27
|
+
a[n][j] = concat(star(a[n][n]), a[n][j]) # prefix the outgoing edge with the starred self-loop
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
(0...n).each do |i|
|
32
|
+
next unless a[i][n]
|
33
|
+
|
34
|
+
b[i] = union(b[i], concat(a[i][n], b[n]))
|
35
|
+
(0...n).each do |j|
|
36
|
+
a[i][j] = union(a[i][j], concat(a[i][n], a[n][j]))
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
b[0].to_s
|
42
|
+
end
|
43
|
+
|
44
|
+
def star(exp)
|
45
|
+
Ast::Repetition.new(exp, '*') if exp
|
46
|
+
end
|
47
|
+
|
48
|
+
def union(a, b)
|
49
|
+
if a && b && a != b
|
50
|
+
res = nil
|
51
|
+
a, b, start = remove_common_substring(a, b, :start)
|
52
|
+
a, b, end_ = remove_common_substring(a, b, :end)
|
53
|
+
|
54
|
+
if (a.respond_to?(:empty?) && a.empty?) || (b.respond_to?(:empty?) && b.empty?)
|
55
|
+
res = Ast::Repetition.new(a.respond_to?(:empty?) && a.empty? ? b : a, '?') # not every node defines empty?
|
56
|
+
elsif a.is_a?(Ast::Repetition) && a.type == '?'
|
57
|
+
res = Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?')
|
58
|
+
elsif b.is_a?(Ast::Repetition) && b.type == '?'
|
59
|
+
res = Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?')
|
60
|
+
else
|
61
|
+
ac = a.char_class if a.respond_to?(:char_class)
|
62
|
+
bc = b.char_class if b.respond_to?(:char_class)
|
63
|
+
res = if ac && bc
|
64
|
+
Ast::CharClass.new(ac, bc)
|
65
|
+
else
|
66
|
+
Ast::Alternation.new(a, b)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty?
|
71
|
+
|
72
|
+
res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty?
|
73
|
+
|
74
|
+
return res
|
75
|
+
end
|
76
|
+
|
77
|
+
a || b
|
78
|
+
end
|
79
|
+
|
80
|
+
def remove_common_substring(a, b, side)
|
81
|
+
al = a.literal(side) if a.respond_to?(:literal)
|
82
|
+
bl = b.literal(side) if b.respond_to?(:literal)
|
83
|
+
return [a, b, nil] if al.nil? || bl.nil? || al.empty? || bl.empty?
|
84
|
+
|
85
|
+
s = common_substring(al, bl, side)
|
86
|
+
return [a, b, ''] if s.empty?
|
87
|
+
|
88
|
+
a = a.remove_substring(side, s.length)
|
89
|
+
b = b.remove_substring(side, s.length)
|
90
|
+
|
91
|
+
[a, b, s]
|
92
|
+
end
|
93
|
+
|
94
|
+
def common_substring(a, b, side)
|
95
|
+
dir = side == :start ? 1 : -1
|
96
|
+
a = a.chars
|
97
|
+
b = b.chars
|
98
|
+
ai = dir == 1 ? 0 : a.length - 1
|
99
|
+
ae = dir == 1 ? a.length : -1
|
100
|
+
bi = dir == 1 ? 0 : b.length - 1
|
101
|
+
be = dir == 1 ? b.length : -1
|
102
|
+
res = ''
|
103
|
+
|
104
|
+
while ai != ae && bi != be && a[ai] == b[bi]
|
105
|
+
if dir == 1
|
106
|
+
res += a[ai]
|
107
|
+
else
|
108
|
+
res = a[ai] + res
|
109
|
+
end
|
110
|
+
ai += dir
|
111
|
+
bi += dir
|
112
|
+
end
|
113
|
+
|
114
|
+
res
|
115
|
+
end
|
116
|
+
|
117
|
+
def concat(a, b)
|
118
|
+
return unless a && b
|
119
|
+
|
120
|
+
return b if a.respond_to?(:empty?) && a.empty?
|
121
|
+
return a if b.respond_to?(:empty?) && b.empty?
|
122
|
+
|
123
|
+
return Ast::Literal.new(a.value + b.value) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Literal)
|
124
|
+
|
125
|
+
if a.is_a?(Ast::Literal) && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal)
|
126
|
+
return Ast::Concatenation.new(Ast::Literal.new(a.value + b.a.value), b.b)
|
127
|
+
end
|
128
|
+
|
129
|
+
if b.is_a?(Ast::Literal) && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal)
|
130
|
+
return Ast::Concatenation.new(a.a, Ast::Literal.new(a.b.value + b.value))
|
131
|
+
end
|
132
|
+
|
133
|
+
Ast::Concatenation.new(a, b)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
data/lib/regexgen/set.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Regexgen
|
4
|
+
module SetUtil
|
5
|
+
refine Set do
|
6
|
+
def shift
|
7
|
+
item = first
|
8
|
+
delete(first)
|
9
|
+
item
|
10
|
+
end
|
11
|
+
|
12
|
+
def replace(search, *replacements)
|
13
|
+
raise unless delete?(search)
|
14
|
+
|
15
|
+
merge(replacements)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Regexgen
|
4
|
+
class State
|
5
|
+
attr_accessor :accepting
|
6
|
+
attr_reader :transitions
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@accepting = false
|
10
|
+
@transitions = Hash.new { |hash, key| hash[key] = State.new }
|
11
|
+
end
|
12
|
+
|
13
|
+
def visit(visited = Set.new)
|
14
|
+
return visited if visited.include?(self)
|
15
|
+
|
16
|
+
visited.add(self)
|
17
|
+
@transitions.each_value { |state| state.visit(visited) }
|
18
|
+
visited
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_h
|
22
|
+
@transitions.transform_values(&:to_h).tap do |h|
|
23
|
+
h[''] = nil if @accepting
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def to_s
|
28
|
+
sigil = @accepting ? '*' : ''
|
29
|
+
"#{sigil}#{to_h}"
|
30
|
+
end
|
31
|
+
|
32
|
+
alias inspect to_s
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'regexgen/state'
|
4
|
+
require 'regexgen/minimize'
|
5
|
+
require 'regexgen/regex'
|
6
|
+
|
7
|
+
module Regexgen
|
8
|
+
class Trie
|
9
|
+
attr_reader :root, :alphabet
|
10
|
+
|
11
|
+
def initialize
|
12
|
+
@alphabet = Set.new
|
13
|
+
@root = State.new
|
14
|
+
end
|
15
|
+
|
16
|
+
def add(str)
|
17
|
+
node = @root
|
18
|
+
str.each_char do |char|
|
19
|
+
@alphabet.add(char)
|
20
|
+
node = node.transitions[char]
|
21
|
+
end
|
22
|
+
node.accepting = true
|
23
|
+
end
|
24
|
+
|
25
|
+
def add_all(strs)
|
26
|
+
strs.each(&method(:add))
|
27
|
+
end
|
28
|
+
|
29
|
+
def minimize
|
30
|
+
Regexgen.minimize(@root, @alphabet)
|
31
|
+
end
|
32
|
+
|
33
|
+
def to_s
|
34
|
+
Regexgen.to_regex(minimize)
|
35
|
+
end
|
36
|
+
|
37
|
+
def to_regex(flags = nil)
|
38
|
+
flags_i = 0
|
39
|
+
flags_i |= Regexp::EXTENDED if flags&.include?('x')
|
40
|
+
flags_i |= Regexp::IGNORECASE if flags&.include?('i')
|
41
|
+
flags_i |= Regexp::MULTILINE if flags&.include?('m')
|
42
|
+
Regexp.new(to_s, flags_i)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/regexgen.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/regexgen/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'regexgen'
|
7
|
+
spec.version = Regexgen::VERSION
|
8
|
+
spec.authors = ['Aaron Madlon-Kay']
|
9
|
+
spec.email = ['aaron@madlon-kay.com']
|
10
|
+
|
11
|
+
spec.summary = 'Generate a minimal regex matching a set of strings'
|
12
|
+
spec.homepage = 'https://github.com/aaron/regexgen-ruby'
|
13
|
+
spec.license = 'MIT'
|
14
|
+
spec.required_ruby_version = Gem::Requirement.new('>= 2.6.0') # lib uses endless ranges (Ruby 2.6+)
|
15
|
+
|
16
|
+
spec.metadata['homepage_uri'] = spec.homepage
|
17
|
+
spec.metadata['source_code_uri'] = 'https://github.com/aaron/regexgen-ruby.git'
|
18
|
+
|
19
|
+
# Specify which files should be added to the gem when it is released.
|
20
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
21
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
22
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
23
|
+
end
|
24
|
+
spec.bindir = 'exe'
|
25
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
26
|
+
spec.require_paths = ['lib']
|
27
|
+
|
28
|
+
spec.add_development_dependency 'byebug', '~> 11'
|
29
|
+
spec.add_development_dependency 'rubocop', '~> 0.89'
|
30
|
+
end
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: regexgen
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aaron Madlon-Kay
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-08-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: byebug
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '11'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '11'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rubocop
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.89'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.89'
|
41
|
+
description:
|
42
|
+
email:
|
43
|
+
- aaron@madlon-kay.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".github/workflows/main.yml"
|
49
|
+
- ".gitignore"
|
50
|
+
- ".rubocop.yml"
|
51
|
+
- ".rubocop_todo.yml"
|
52
|
+
- ".ruby-version"
|
53
|
+
- ".travis.yml"
|
54
|
+
- Gemfile
|
55
|
+
- Gemfile.lock
|
56
|
+
- LICENSE.txt
|
57
|
+
- README.md
|
58
|
+
- Rakefile
|
59
|
+
- bin/console
|
60
|
+
- bin/setup
|
61
|
+
- lib/regexgen.rb
|
62
|
+
- lib/regexgen/ast.rb
|
63
|
+
- lib/regexgen/minimize.rb
|
64
|
+
- lib/regexgen/regex.rb
|
65
|
+
- lib/regexgen/set.rb
|
66
|
+
- lib/regexgen/state.rb
|
67
|
+
- lib/regexgen/trie.rb
|
68
|
+
- lib/regexgen/version.rb
|
69
|
+
- regexgen.gemspec
|
70
|
+
homepage: https://github.com/aaron/regexgen-ruby
|
71
|
+
licenses:
|
72
|
+
- MIT
|
73
|
+
metadata:
|
74
|
+
homepage_uri: https://github.com/aaron/regexgen-ruby
|
75
|
+
source_code_uri: https://github.com/aaron/regexgen-ruby.git
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options: []
|
78
|
+
require_paths:
|
79
|
+
- lib
|
80
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 2.3.0
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubygems_version: 3.0.3
|
92
|
+
signing_key:
|
93
|
+
specification_version: 4
|
94
|
+
summary: Generate a minimal regex matching a set of strings
|
95
|
+
test_files: []
|