regexgen 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 46f71792a530cbee9556f1d401be2095c27b89d177c8cfbb0a5a2bacbdbb4c43
4
+ data.tar.gz: 749f8d8e47d1539239e47d7c19ad688fd7056d75278a94a7dd555248a3649f9b
5
+ SHA512:
6
+ metadata.gz: 03f4f12753dee716dadde7e6b3e244356327506ec598a5ffec93427cfb42dd20ff2732005a8907915d7155300fbe4511aaf857b19eb123d35a8463598e29d3e8
7
+ data.tar.gz: 89490be6ae578fde9e753221afdaa1d83767af9e77f638b6d788d3b632f22d81ac608740d7e22c15edd9145d706aee3f333343e013e1bc5b9cf4ab529e568f5e
@@ -0,0 +1,23 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ master ]
6
+ pull_request:
7
+ branches: [ master ]
8
+
9
+ jobs:
10
+ build:
11
+
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v2
16
+ - name: Setup Ruby, JRuby and TruffleRuby
17
+ uses: ruby/setup-ruby@v1.40.0
18
+ - name: Install dependencies
19
+ run: bundle install
20
+ - name: Run tests
21
+ run: bundle exec rake test
22
+ - name: Lint
23
+ run: bundle exec rubocop
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ .byebug_history
@@ -0,0 +1,84 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ Style/AsciiComments:
4
+ Enabled: false
5
+
6
+ Naming/MethodParameterName:
7
+ Exclude:
8
+ - 'lib/regexgen/regex.rb'
9
+ - 'lib/regexgen/ast.rb'
10
+
11
+ Layout/EmptyLinesAroundAttributeAccessor:
12
+ Enabled: true
13
+ Layout/SpaceAroundMethodCallOperator:
14
+ Enabled: true
15
+ Lint/BinaryOperatorWithIdenticalOperands:
16
+ Enabled: true
17
+ Lint/DeprecatedOpenSSLConstant:
18
+ Enabled: true
19
+ Lint/DuplicateElsifCondition:
20
+ Enabled: true
21
+ Lint/DuplicateRescueException:
22
+ Enabled: true
23
+ Lint/EmptyConditionalBody:
24
+ Enabled: true
25
+ Lint/FloatComparison:
26
+ Enabled: true
27
+ Lint/MissingSuper:
28
+ Enabled: true
29
+ Lint/MixedRegexpCaptureTypes:
30
+ Enabled: true
31
+ Lint/OutOfRangeRegexpRef:
32
+ Enabled: true
33
+ Lint/RaiseException:
34
+ Enabled: true
35
+ Lint/SelfAssignment:
36
+ Enabled: true
37
+ Lint/StructNewOverride:
38
+ Enabled: true
39
+ Lint/TopLevelReturnWithArgument:
40
+ Enabled: true
41
+ Lint/UnreachableLoop:
42
+ Enabled: true
43
+ Style/AccessorGrouping:
44
+ Enabled: true
45
+ Style/ArrayCoercion:
46
+ Enabled: true
47
+ Style/BisectedAttrAccessor:
48
+ Enabled: true
49
+ Style/CaseLikeIf:
50
+ Enabled: true
51
+ Style/ExplicitBlockArgument:
52
+ Enabled: true
53
+ Style/ExponentialNotation:
54
+ Enabled: true
55
+ Style/GlobalStdStream:
56
+ Enabled: true
57
+ Style/HashAsLastArrayItem:
58
+ Enabled: true
59
+ Style/HashEachMethods:
60
+ Enabled: true
61
+ Style/HashLikeCase:
62
+ Enabled: true
63
+ Style/HashTransformKeys:
64
+ Enabled: true
65
+ Style/HashTransformValues:
66
+ Enabled: true
67
+ Style/OptionalBooleanParameter:
68
+ Enabled: true
69
+ Style/RedundantAssignment:
70
+ Enabled: true
71
+ Style/RedundantFetchBlock:
72
+ Enabled: true
73
+ Style/RedundantFileExtensionInRequire:
74
+ Enabled: true
75
+ Style/RedundantRegexpCharacterClass:
76
+ Enabled: true
77
+ Style/RedundantRegexpEscape:
78
+ Enabled: true
79
+ Style/SingleArgumentDig:
80
+ Enabled: true
81
+ Style/SlicingWithRange:
82
+ Enabled: true
83
+ Style/StringConcatenation:
84
+ Enabled: true
@@ -0,0 +1,43 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2020-08-10 13:40:30 UTC using RuboCop version 0.89.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 5
10
+ # Configuration parameters: IgnoredMethods.
11
+ Metrics/AbcSize:
12
+ Max: 66
13
+
14
+ # Offense count: 7
15
+ # Configuration parameters: IgnoredMethods.
16
+ Metrics/CyclomaticComplexity:
17
+ Max: 22
18
+
19
+ # Offense count: 6
20
+ # Configuration parameters: CountComments, CountAsOne, ExcludedMethods.
21
+ Metrics/MethodLength:
22
+ Max: 39
23
+
24
+ # Offense count: 1
25
+ # Configuration parameters: CountComments, CountAsOne.
26
+ Metrics/ModuleLength:
27
+ Max: 103
28
+
29
+ # Offense count: 6
30
+ # Configuration parameters: IgnoredMethods.
31
+ Metrics/PerceivedComplexity:
32
+ Max: 24
33
+
34
+ # Offense count: 5
35
+ Style/Documentation:
36
+ Exclude:
37
+ - 'spec/**/*'
38
+ - 'test/**/*'
39
+ - 'lib/regexgen/minimize.rb'
40
+ - 'lib/regexgen/regex.rb'
41
+ - 'lib/regexgen/set.rb'
42
+ - 'lib/regexgen/state.rb'
43
+ - 'lib/regexgen/trie.rb'
@@ -0,0 +1 @@
1
+ 2.6.5
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.6.5
6
+ before_install: gem install bundler -v 2.1.4
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in regexgen.gemspec
6
+ gemspec
7
+
8
+ gem 'minitest', '~> 5.0'
9
+ gem 'rake', '~> 12.0'
@@ -0,0 +1,44 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ regexgen (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.1)
10
+ byebug (11.1.3)
11
+ minitest (5.14.1)
12
+ parallel (1.19.2)
13
+ parser (2.7.1.4)
14
+ ast (~> 2.4.1)
15
+ rainbow (3.0.0)
16
+ rake (12.3.3)
17
+ regexp_parser (1.7.1)
18
+ rexml (3.2.4)
19
+ rubocop (0.89.0)
20
+ parallel (~> 1.10)
21
+ parser (>= 2.7.1.1)
22
+ rainbow (>= 2.2.2, < 4.0)
23
+ regexp_parser (>= 1.7)
24
+ rexml
25
+ rubocop-ast (>= 0.1.0, < 1.0)
26
+ ruby-progressbar (~> 1.7)
27
+ unicode-display_width (>= 1.4.0, < 2.0)
28
+ rubocop-ast (0.3.0)
29
+ parser (>= 2.7.1.4)
30
+ ruby-progressbar (1.10.1)
31
+ unicode-display_width (1.7.0)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ byebug (~> 11)
38
+ minitest (~> 5.0)
39
+ rake (~> 12.0)
40
+ regexgen!
41
+ rubocop (~> 0.89)
42
+
43
+ BUNDLED WITH
44
+ 2.1.4
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Aaron Madlon-Kay
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,85 @@
1
+ # regexgen
2
+
3
+ Generate regular expressions that match a set of strings.
4
+
5
+ This is a Ruby port of [@devongovett](https://github.com/devongovett/regexgen)'s
6
+ JavaScript [regexgen](https://github.com/devongovett/regexgen) package.
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ ```ruby
13
+ gem 'regexgen'
14
+ ```
15
+
16
+ And then execute:
17
+
18
+ $ bundle install
19
+
20
+ Or install it yourself as:
21
+
22
+ $ gem install regexgen
23
+
24
+ ## Usage
25
+
26
+ ```ruby
27
+ require 'regexgen'
28
+
29
+ Regexgen.generate(['foobar', 'foobaz', 'foozap', 'fooza']) #=> /foo(?:zap?|ba[rz])/
30
+ ```
31
+
32
+ ## Unicode handling
33
+
34
+ Unlike the JavaScript version, this package does not do any special Unicode
35
+ handling because Ruby does it all for you. We recommend that you use a Unicode
36
+ encoding for your strings.
37
+
38
+ ## How does it work?
39
+
40
+ Just like the JavaScript version:
41
+
42
+ 1. Generate a [Trie](https://en.wikipedia.org/wiki/Trie) containing all of the
43
+ input strings. This is a tree structure where each edge represents a single
44
+ character. This removes redundancies at the start of the strings, but common
45
+ branches further down are not merged.
46
+
47
+ 2. A trie can be seen as a tree-shaped deterministic finite automaton (DFA), so
48
+ DFA algorithms can be applied. In this case, we apply [Hopcroft's DFA
49
+ minimization
50
+ algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm)
51
+ to merge the nondistinguishable states.
52
+
53
+ 3. Convert the resulting minimized DFA to a regular expression. This is done
54
+ using [Brzozowski's algebraic
55
+ method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392),
56
+ which is quite elegant. It expresses the DFA as a system of equations which
57
+ can be solved for a resulting regex. Along the way, some additional
58
+ optimizations are made, such as hoisting common substrings out of an
59
+ alternation, and using character class ranges. This produces an [Abstract
60
+ Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST) for
61
+ the regex, which is then converted to a string and compiled to a Ruby
62
+ `Regexp` object.
63
+
64
+ ## Development
65
+
66
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
67
+ `rake test` to run the tests. You can also run `bin/console` for an interactive
68
+ prompt that will allow you to experiment.
69
+
70
+ To install this gem onto your local machine, run `bundle exec rake install`. To
71
+ release a new version, update the version number in `version.rb`, and then run
72
+ `bundle exec rake release`, which will create a git tag for the version, push
73
+ git commits and tags, and push the `.gem` file to
74
+ [rubygems.org](https://rubygems.org).
75
+
76
+ ## Contributing
77
+
78
+ Bug reports and pull requests are welcome on GitHub at
79
+ https://github.com/amake/regexgen.
80
+
81
+
82
+ ## License
83
+
84
+ The gem is available as open source under the terms of the [MIT
85
+ License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rake/testtask'
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << 'test'
8
+ t.libs << 'lib'
9
+ t.test_files = FileList['test/**/*_test.rb']
10
+ end
11
+
12
+ task default: :test
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'regexgen'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/version'
4
+ require 'regexgen/trie'
5
+
6
+ # Generate regular expressions that match a set of strings
7
module Regexgen
  # Builds a Regexp that matches the given strings.
  #
  # strings - enumerable whose elements are each handed to Trie#add
  # flags   - optional flags value, forwarded untouched to Trie#to_regex
  #
  # Returns the result of Trie#to_regex.
  def self.generate(strings, flags = nil)
    trie = Trie.new
    strings.each { |string| trie.add(string) }
    trie.to_regex(flags)
  end
end
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Regexgen
4
+ # Classes in the abstract syntax tree representation
5
+ module Ast
6
+ # Represents an alternation (e.g. `foo|bar`)
7
class Alternation
  attr_reader :precedence, :options

  # Stores the given options, splicing in the options of any nested
  # Alternation, ordered from longest to shortest.
  def initialize(*options)
    @precedence = 1
    @options = flatten(options).sort { |x, y| y.length <=> x.length }
  end

  # Length of the longest option (options are kept longest-first).
  def length
    @options.first.length
  end

  # Renders the options separated by `|`, letting Ast.parens decide
  # whether each one needs a group around it.
  def to_s
    rendered = @options.map { |option| Ast.parens(option, self) }
    rendered.join('|')
  end

  private

  # Replaces every Alternation in the list by its own options so that the
  # result contains no directly nested Alternation.
  def flatten(options)
    expanded = options.map do |option|
      next option unless option.is_a?(Alternation)

      flatten(option.options)
    end
    expanded.flatten
  end
end
30
+
31
+ # Represents a character class (e.g. [0-9a-z])
32
class CharClass
  attr_reader :precedence

  # a, b - a single-character string or a (possibly nested) array of such
  #        strings; both are merged into one flat member list.
  def initialize(a, b)
    @precedence = 1
    @set = [a, b].flatten
  end

  # A character class always consumes exactly one character.
  def length
    1
  end

  # True when no member has a codepoint above U+FFFF.
  def single_character?
    @set.none? { |c| c.ord > 0xffff }
  end

  def single_codepoint?
    true
  end

  # Renders the class as regex source, e.g. `[a-c]`.
  def to_s
    "[#{to_ranges_string}]"
  end

  # Returns the flat list of member characters.
  def char_class
    @set
  end

  private

  # Backslash-escapes the characters that carry meaning inside `[...]`.
  # Without this a member such as `^` would turn `[^a]` into a negated
  # class and `]` would terminate the class early.
  def escape(char)
    char.gsub(/[\\\[\]\^\-]/) { |special| "\\#{special}" }
  end

  # Joins the ranges into the text placed between the brackets. Two
  # adjacent characters are written side by side rather than as a range.
  def to_ranges_string
    to_ranges.map do |first, last|
      if first == last
        escape(first)
      elsif first.ord.next == last.ord
        "#{escape(first)}#{escape(last)}"
      else
        "#{escape(first)}-#{escape(last)}"
      end
    end.join
  end

  # Groups the sorted members into [first, last] pairs of consecutive
  # codepoints. Duplicates are dropped first so that no member is emitted
  # twice.
  def to_ranges
    set = @set.uniq.sort
    ranges = [[set.first, set.first]]
    set.drop(1).each_with_object(ranges) do |c, acc|
      if acc.last.last.ord.next == c.ord
        acc.last[-1] = c
      else
        acc << [c, c]
      end
    end
  end
end
86
+
87
+ # Represents a concatenation (e.g. `foo`)
88
class Concatenation
  attr_reader :precedence, :a, :b

  # a, b - the left and right operands, in match order.
  def initialize(a, b)
    @a = a
    @b = b
    @precedence = 2
  end

  # Combined length of both operands.
  def length
    [@a, @b].map(&:length).sum
  end

  def to_s
    "#{Ast.parens(@a, self)}#{Ast.parens(@b, self)}"
  end

  # Returns the literal text at the requested edge: the left operand's for
  # :start, the right operand's for :end. Returns nil when that operand
  # has no #literal or the side is anything else.
  def literal(side)
    case side
    when :start then @a.literal(side) if @a.respond_to?(:literal)
    when :end then @b.literal(side) if @b.respond_to?(:literal)
    end
  end

  # Strips len characters from the requested edge, when the operand on
  # that edge supports it. If an operand ends up empty the other one is
  # returned on its own instead of a new Concatenation.
  def remove_substring(side, len)
    left = @a
    right = @b
    if side == :start && @a.respond_to?(:remove_substring)
      left = @a.remove_substring(side, len)
    end
    if side == :end && @b.respond_to?(:remove_substring)
      right = @b.remove_substring(side, len)
    end

    return right if left.respond_to?(:empty?) && left.empty?
    return left if right.respond_to?(:empty?) && right.empty?

    Concatenation.new(left, right)
  end
end
123
+
124
+ # Represents a repetition (e.g. `a*` or `a?`)
125
class Repetition
  attr_reader :precedence, :expr, :type

  # expr - the repeated expression
  # type - the quantifier text appended after it (e.g. '*' or '?')
  def initialize(expr, type)
    @expr = expr
    @type = type
    @precedence = 3
  end

  # Length of the repeated expression; the quantifier is not counted.
  def length
    @expr.length
  end

  def to_s
    body = Ast.parens(@expr, self)
    body + @type
  end
end
142
+
143
+ # Represents a literal (e.g. a string)
144
class Literal
  attr_reader :precedence, :value

  # value - the literal text this node matches.
  def initialize(value)
    @value = value
    @precedence = 2
  end

  def empty?
    @value.empty?
  end

  def single_character?
    length == 1
  end

  def single_codepoint?
    @value.codepoints.size == 1
  end

  def length
    @value.length
  end

  # Renders the text with regex metacharacters escaped.
  def to_s
    Regexp.escape(@value)
  end

  # Returns the text when it is a single codepoint, so that it can become
  # a character-class member; nil otherwise.
  def char_class
    single_codepoint? ? @value : nil
  end

  # Returns the text itself; the side makes no difference for a literal.
  def literal(_side = nil)
    @value
  end

  # Returns a new Literal with len characters removed from the given side
  # (:start or :end); any other side yields nil.
  def remove_substring(side, len)
    case side
    when :start then Literal.new(@value[len..])
    when :end then Literal.new(@value[0...(@value.length - len)])
    end
  end
end
185
+
186
class << self
  # Renders exp for embedding inside parent. The text is wrapped in a
  # non-capturing group only when exp binds more loosely than parent and
  # reports neither single_character? nor single_codepoint? as true.
  def parens(exp, parent)
    rendered = exp.to_s
    return rendered unless exp.precedence < parent.precedence
    return rendered if exp.respond_to?(:single_character?) && exp.single_character?
    return rendered if exp.respond_to?(:single_codepoint?) && exp.single_codepoint?

    "(?:#{rendered})"
  end
end
198
+ end
199
+ end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/set'
4
+
5
+ module Regexgen
6
+ using SetUtil
7
+
8
class << self
  # Hopcroft's DFA minimization algorithm
  # https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
  #
  # Largely ported from
  # https://github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js
  #
  # P := {F, Q \ F};
  # W := {F, Q \ F};
  # while (W is not empty) do
  #   choose and remove a set A from W
  #   for each c in Σ do
  #     let X be the set of states for which a transition on c leads to a state in A
  #     for each set Y in P for which X ∩ Y is nonempty and Y \ X is nonempty do
  #       replace Y in P by the two sets X ∩ Y and Y \ X
  #       if Y is in W
  #         replace Y in W by the same two sets
  #       else
  #         if |X ∩ Y| <= |Y \ X|
  #           add X ∩ Y to W
  #         else
  #           add Y \ X to W
  #     end;
  #   end;
  # end;
  #
  # Key:
  #
  # {...} is a Set (yes we have sets of sets here)
  # A \ B is complement (A - B)
  # Q is all states
  # F is final states
  # Σ is the DFA's alphabet
  # c is a letter of the alphabet
  # A ∩ B is intersection (A & B)
  # |A| is the cardinality of A (A.size)
  #
  # root     - start state; its #visit must return the Set of all states
  # alphabet - enumerable of the characters used on transitions
  #
  # Returns the start State of a newly built, minimized automaton.
  def minimize(root, alphabet)
    states = root.visit
    final_states = states.select(&:accepting).to_set

    # Seed with the accepting / non-accepting split, exactly as in the
    # pseudo-code. Seeding with all states plus the accepting ones yields
    # overlapping blocks, and refinement alone never separates an accepting
    # state from a non-accepting one with identical transitions. Empty
    # seed sets are dropped because every block must have a first member
    # when the new states are built below.
    p = Set[final_states, states - final_states].delete_if(&:empty?)
    w = Set.new(p)
    until w.empty?
      a = w.shift
      alphabet.each do |c|
        # X: states whose transition on c lands inside A.
        x = states.each_with_object(Set.new) do |s, acc|
          # key? avoids triggering the auto-creating default of transitions
          next unless s.transitions.key?(c)

          acc.add(s) if a.include?(s.transitions[c])
        end
        # Iterate over a snapshot because P is modified inside the loop.
        p.to_a.each do |y|
          intersection = x & y
          next if intersection.empty?

          complement = y - x
          next if complement.empty?

          p.replace(y, intersection, complement)
          if w.include?(y)
            w.replace(y, intersection, complement)
          elsif intersection.size <= complement.size
            w.add(intersection)
          else
            w.add(complement)
          end
        end
      end
    end

    # Every block of P becomes one state of the minimized automaton.
    new_states = Hash.new { |hash, key| hash[key] = State.new }
    initial = nil

    p.each do |s|
      # All members of a block are equivalent, so the first one serves as
      # the template for transitions and acceptance.
      first = s.first
      s_ = new_states[s]
      first.transitions.each do |c, old|
        s_.transitions[c] = new_states[p.find { |v| v.include?(old) }]
      end

      s_.accepting = first.accepting

      initial = s_ if s.include?(root)
    end

    initial
  end
end
95
+ end
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/ast'
4
+
5
+ module Regexgen
6
+ class<<self
7
# Converts the automaton reachable from root into regex source text by
# eliminating states one at a time, from the last visited down to the
# first.
#
# a[i][j] is the expression labelling the edge from state i to state j;
# b[i] is the expression leading from state i to acceptance (the empty
# literal for an accepting state, nil while unknown).
#
# root - start state; its #visit must yield every reachable state, with
#        root first.
#
# Returns the String form of the expression for the start state.
def to_regex(root)
  states = root.visit.to_a
  # Position lookup by identity, replacing a linear scan per transition.
  index_of = states.each_with_index.to_h

  a = []
  b = []

  states.each_with_index do |state, i|
    b[i] = Ast::Literal.new('') if state.accepting

    a[i] = []
    state.transitions.each do |char, target|
      j = index_of[target]
      label = Ast::Literal.new(char)
      a[i][j] = a[i][j] ? union(a[i][j], label) : label
    end
  end

  (states.length - 1).downto(0) do |n|
    if a[n][n]
      # A self-loop on state n contributes (loop)* in front of everything
      # that leaves n.
      looped = star(a[n][n])
      b[n] = concat(looped, b[n])
      (0...n).each do |j|
        a[n][j] = concat(looped, a[n][j])
      end
    end

    # Substitute state n into every earlier state that has an edge to it.
    (0...n).each do |i|
      next unless a[i][n]

      b[i] = union(b[i], concat(a[i][n], b[n]))
      (0...n).each do |j|
        a[i][j] = union(a[i][j], concat(a[i][n], a[n][j]))
      end
    end
  end

  b[0].to_s
end
43
+
44
# Wraps exp in a `*` repetition; a nil/false exp yields nil.
def star(exp)
  return unless exp

  Ast::Repetition.new(exp, '*')
end
47
+
48
# Builds an expression matching either a or b.
#
# When one side is missing, or both are equal, the existing side is
# returned unchanged. Otherwise shared literal text at the start and end
# is hoisted out, and the remainders are combined into an optional
# expression, a character class, or an alternation.
#
# Returns an Ast node, or a || b for the trivial cases.
def union(a, b)
  return a || b unless a && b && a != b

  a, b, start = remove_common_substring(a, b, :start)
  a, b, end_ = remove_common_substring(a, b, :end)

  # Not every node type defines empty?, so both checks are guarded.
  # Calling a.empty? unguarded raises NoMethodError when only b is the
  # empty literal and a is, say, a character class.
  a_empty = a.respond_to?(:empty?) && a.empty?
  b_empty = b.respond_to?(:empty?) && b.empty?

  res = if a_empty || b_empty
          # One side matches nothing, so the other becomes optional.
          Ast::Repetition.new(a_empty ? b : a, '?')
        elsif a.is_a?(Ast::Repetition) && a.type == '?'
          Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?')
        elsif b.is_a?(Ast::Repetition) && b.type == '?'
          Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?')
        else
          ac = a.char_class if a.respond_to?(:char_class)
          bc = b.char_class if b.respond_to?(:char_class)
          ac && bc ? Ast::CharClass.new(ac, bc) : Ast::Alternation.new(a, b)
        end

  # Re-attach the hoisted prefix and suffix around the combined middle.
  res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty?
  res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty?

  res
end
79
+
80
# Strips the literal text that a and b share on the given side.
#
# Returns [a, b, shared]: shared is nil when either operand offers no
# non-empty literal on that side, '' when the literals have nothing in
# common, and otherwise the shared text, with a and b replaced by their
# trimmed versions.
def remove_common_substring(a, b, side)
  a_text = a.respond_to?(:literal) ? a.literal(side) : nil
  b_text = b.respond_to?(:literal) ? b.literal(side) : nil
  return [a, b, nil] if [a_text, b_text].any? { |text| text.nil? || text.empty? }

  shared = common_substring(a_text, b_text, side)
  return [a, b, ''] if shared.empty?

  count = shared.length
  [a.remove_substring(side, count), b.remove_substring(side, count), shared]
end
93
+
94
# Returns the longest run of characters shared by a and b, anchored at the
# start when side is :start and at the end for any other side.
def common_substring(a, b, side)
  from_start = side == :start
  left = a.chars
  right = b.chars
  unless from_start
    left.reverse!
    right.reverse!
  end

  # zip pads the shorter right-hand side with nil, which never equals a
  # character, so the scan stops at the end of either string.
  shared = left.zip(right).take_while { |x, y| x == y }.map(&:first)
  shared.reverse! unless from_start
  shared.join
end
116
+
117
# Builds an expression matching a followed by b.
#
# Returns nil when either side is missing, the other side when one is
# empty, and otherwise a node in which adjacent literals are merged into
# a single Literal where possible.
def concat(a, b)
  return nil if !a || !b
  return b if a.respond_to?(:empty?) && a.empty?
  return a if b.respond_to?(:empty?) && b.empty?

  a_literal = a.is_a?(Ast::Literal)
  b_literal = b.is_a?(Ast::Literal)

  if a_literal && b_literal
    Ast::Literal.new(a.value + b.value)
  elsif a_literal && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal)
    # Fold a into the literal that already leads b.
    Ast::Concatenation.new(Ast::Literal.new(a.value + b.a.value), b.b)
  elsif b_literal && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal)
    # Fold b into the literal that already trails a.
    Ast::Concatenation.new(a.a, Ast::Literal.new(a.b.value + b.value))
  else
    Ast::Concatenation.new(a, b)
  end
end
135
+ end
136
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
# Set is refined below, so it has to be loaded before this file is
# evaluated; on Rubies where Set is not built in, nothing else in the
# library requires it.
require 'set'

module Regexgen
  # Refinements that let a Set be used as a work list and have one member
  # swapped for several others.
  module SetUtil
    refine Set do
      # Removes and returns the first member in iteration order, or nil
      # when the set is empty.
      def shift
        item = first
        delete(item)
        item
      end

      # Removes search and adds every replacement in its place.
      #
      # Raises RuntimeError when search is not a member. Inside the
      # refinement's scope this shadows the built-in one-argument
      # Set#replace.
      def replace(search, *replacements)
        raise 'cannot replace an item that is not in the set' unless delete?(search)

        merge(replacements)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Regexgen
4
# visit builds a Set, so make sure the class is loaded.
require 'set'

# A node of the automaton: an accepting flag plus outgoing transitions
# keyed by character.
class State
  attr_accessor :accepting
  attr_reader :transitions

  def initialize
    @accepting = false
    # Looking up a character that has no transition yet creates, stores
    # and returns a fresh State for it.
    @transitions = Hash.new { |hash, key| hash[key] = State.new }
  end

  # Collects this state and everything reachable from it.
  #
  # visited - Set to add to; states already in it are not expanded again.
  #
  # Returns visited, filled in depth-first order with transitions followed
  # in insertion order. An explicit stack is used instead of recursion so
  # that very long chains of states cannot exhaust the call stack.
  def visit(visited = Set.new)
    pending = [self]
    until pending.empty?
      state = pending.pop
      next if visited.include?(state)

      visited.add(state)
      # Pushed in reverse so the first transition is popped first.
      pending.concat(state.transitions.values.reverse)
    end
    visited
  end

  # Nested Hash view of the automaton; an accepting state carries an extra
  # '' => nil entry.
  def to_h
    @transitions.transform_values(&:to_h).tap do |h|
      h[''] = nil if @accepting
    end
  end

  # The to_h view, prefixed with '*' when this state is accepting.
  def to_s
    sigil = @accepting ? '*' : ''
    "#{sigil}#{to_h}"
  end

  alias inspect to_s
end
34
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/state'
4
+ require 'regexgen/minimize'
5
+ require 'regexgen/regex'
6
+
7
+ module Regexgen
8
# Prefix tree over the input strings, stored as State nodes.
class Trie
  attr_reader :root, :alphabet

  def initialize
    @root = State.new
    @alphabet = Set.new
  end

  # Inserts str: walks (creating as needed) one transition per character,
  # records each character in the alphabet and marks the final node as
  # accepting.
  def add(str)
    leaf = str.each_char.inject(@root) do |node, char|
      @alphabet.add(char)
      node.transitions[char]
    end
    leaf.accepting = true
  end

  # Inserts every string of strs.
  def add_all(strs)
    strs.each { |str| add(str) }
  end

  # Returns the start state of the minimized automaton.
  def minimize
    Regexgen.minimize(@root, @alphabet)
  end

  # Returns the regex source text for the minimized automaton.
  def to_s
    Regexgen.to_regex(minimize)
  end

  # Compiles the regex source into a Regexp.
  #
  # flags - nil, or anything responding to include?; the entries 'x', 'i'
  #         and 'm' switch on extended, case-insensitive and multiline
  #         mode respectively.
  def to_regex(flags = nil)
    known = {
      'x' => Regexp::EXTENDED,
      'i' => Regexp::IGNORECASE,
      'm' => Regexp::MULTILINE
    }
    options = known.select { |flag, _| flags&.include?(flag) }.values.reduce(0, :|)
    Regexp.new(to_s, options)
  end
end
45
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Regexgen
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/regexgen/version'
4
+
5
Gem::Specification.new do |spec|
  spec.name = 'regexgen'
  spec.version = Regexgen::VERSION
  spec.authors = ['Aaron Madlon-Kay']
  spec.email = ['aaron@madlon-kay.com']

  spec.summary = 'Generate a minimal regex matching a set of strings'
  spec.homepage = 'https://github.com/aaron/regexgen-ruby'
  spec.license = 'MIT'
  # The library uses endless ranges (`value[len..]`), which are a syntax
  # error before Ruby 2.6, and Hash#transform_values, which needs 2.4, so
  # older interpreters cannot load it.
  spec.required_ruby_version = Gem::Requirement.new('>= 2.6.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/aaron/regexgen-ruby.git'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'byebug', '~> 11'
  spec.add_development_dependency 'rubocop', '~> 0.89'
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: regexgen
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Madlon-Kay
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-08-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: byebug
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubocop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.89'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.89'
41
+ description:
42
+ email:
43
+ - aaron@madlon-kay.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".github/workflows/main.yml"
49
+ - ".gitignore"
50
+ - ".rubocop.yml"
51
+ - ".rubocop_todo.yml"
52
+ - ".ruby-version"
53
+ - ".travis.yml"
54
+ - Gemfile
55
+ - Gemfile.lock
56
+ - LICENSE.txt
57
+ - README.md
58
+ - Rakefile
59
+ - bin/console
60
+ - bin/setup
61
+ - lib/regexgen.rb
62
+ - lib/regexgen/ast.rb
63
+ - lib/regexgen/minimize.rb
64
+ - lib/regexgen/regex.rb
65
+ - lib/regexgen/set.rb
66
+ - lib/regexgen/state.rb
67
+ - lib/regexgen/trie.rb
68
+ - lib/regexgen/version.rb
69
+ - regexgen.gemspec
70
+ homepage: https://github.com/aaron/regexgen-ruby
71
+ licenses:
72
+ - MIT
73
+ metadata:
74
+ homepage_uri: https://github.com/aaron/regexgen-ruby
75
+ source_code_uri: https://github.com/aaron/regexgen-ruby.git
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 2.3.0
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubygems_version: 3.0.3
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Generate a minimal regex matching a set of strings
95
+ test_files: []