regexgen 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 46f71792a530cbee9556f1d401be2095c27b89d177c8cfbb0a5a2bacbdbb4c43
4
+ data.tar.gz: 749f8d8e47d1539239e47d7c19ad688fd7056d75278a94a7dd555248a3649f9b
5
+ SHA512:
6
+ metadata.gz: 03f4f12753dee716dadde7e6b3e244356327506ec598a5ffec93427cfb42dd20ff2732005a8907915d7155300fbe4511aaf857b19eb123d35a8463598e29d3e8
7
+ data.tar.gz: 89490be6ae578fde9e753221afdaa1d83767af9e77f638b6d788d3b632f22d81ac608740d7e22c15edd9145d706aee3f333343e013e1bc5b9cf4ab529e568f5e
@@ -0,0 +1,23 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ master ]
6
+ pull_request:
7
+ branches: [ master ]
8
+
9
+ jobs:
10
+ build:
11
+
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v2
16
+ - name: Setup Ruby, JRuby and TruffleRuby
17
+ uses: ruby/setup-ruby@v1.40.0
18
+ - name: Install dependencies
19
+ run: bundle install
20
+ - name: Run tests
21
+ run: bundle exec rake test
22
+ - name: Lint
23
+ run: bundle exec rubocop
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ .byebug_history
@@ -0,0 +1,84 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ Style/AsciiComments:
4
+ Enabled: false
5
+
6
+ Naming/MethodParameterName:
7
+ Exclude:
8
+ - 'lib/regexgen/regex.rb'
9
+ - 'lib/regexgen/ast.rb'
10
+
11
+ Layout/EmptyLinesAroundAttributeAccessor:
12
+ Enabled: true
13
+ Layout/SpaceAroundMethodCallOperator:
14
+ Enabled: true
15
+ Lint/BinaryOperatorWithIdenticalOperands:
16
+ Enabled: true
17
+ Lint/DeprecatedOpenSSLConstant:
18
+ Enabled: true
19
+ Lint/DuplicateElsifCondition:
20
+ Enabled: true
21
+ Lint/DuplicateRescueException:
22
+ Enabled: true
23
+ Lint/EmptyConditionalBody:
24
+ Enabled: true
25
+ Lint/FloatComparison:
26
+ Enabled: true
27
+ Lint/MissingSuper:
28
+ Enabled: true
29
+ Lint/MixedRegexpCaptureTypes:
30
+ Enabled: true
31
+ Lint/OutOfRangeRegexpRef:
32
+ Enabled: true
33
+ Lint/RaiseException:
34
+ Enabled: true
35
+ Lint/SelfAssignment:
36
+ Enabled: true
37
+ Lint/StructNewOverride:
38
+ Enabled: true
39
+ Lint/TopLevelReturnWithArgument:
40
+ Enabled: true
41
+ Lint/UnreachableLoop:
42
+ Enabled: true
43
+ Style/AccessorGrouping:
44
+ Enabled: true
45
+ Style/ArrayCoercion:
46
+ Enabled: true
47
+ Style/BisectedAttrAccessor:
48
+ Enabled: true
49
+ Style/CaseLikeIf:
50
+ Enabled: true
51
+ Style/ExplicitBlockArgument:
52
+ Enabled: true
53
+ Style/ExponentialNotation:
54
+ Enabled: true
55
+ Style/GlobalStdStream:
56
+ Enabled: true
57
+ Style/HashAsLastArrayItem:
58
+ Enabled: true
59
+ Style/HashEachMethods:
60
+ Enabled: true
61
+ Style/HashLikeCase:
62
+ Enabled: true
63
+ Style/HashTransformKeys:
64
+ Enabled: true
65
+ Style/HashTransformValues:
66
+ Enabled: true
67
+ Style/OptionalBooleanParameter:
68
+ Enabled: true
69
+ Style/RedundantAssignment:
70
+ Enabled: true
71
+ Style/RedundantFetchBlock:
72
+ Enabled: true
73
+ Style/RedundantFileExtensionInRequire:
74
+ Enabled: true
75
+ Style/RedundantRegexpCharacterClass:
76
+ Enabled: true
77
+ Style/RedundantRegexpEscape:
78
+ Enabled: true
79
+ Style/SingleArgumentDig:
80
+ Enabled: true
81
+ Style/SlicingWithRange:
82
+ Enabled: true
83
+ Style/StringConcatenation:
84
+ Enabled: true
@@ -0,0 +1,43 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2020-08-10 13:40:30 UTC using RuboCop version 0.89.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 5
10
+ # Configuration parameters: IgnoredMethods.
11
+ Metrics/AbcSize:
12
+ Max: 66
13
+
14
+ # Offense count: 7
15
+ # Configuration parameters: IgnoredMethods.
16
+ Metrics/CyclomaticComplexity:
17
+ Max: 22
18
+
19
+ # Offense count: 6
20
+ # Configuration parameters: CountComments, CountAsOne, ExcludedMethods.
21
+ Metrics/MethodLength:
22
+ Max: 39
23
+
24
+ # Offense count: 1
25
+ # Configuration parameters: CountComments, CountAsOne.
26
+ Metrics/ModuleLength:
27
+ Max: 103
28
+
29
+ # Offense count: 6
30
+ # Configuration parameters: IgnoredMethods.
31
+ Metrics/PerceivedComplexity:
32
+ Max: 24
33
+
34
+ # Offense count: 5
35
+ Style/Documentation:
36
+ Exclude:
37
+ - 'spec/**/*'
38
+ - 'test/**/*'
39
+ - 'lib/regexgen/minimize.rb'
40
+ - 'lib/regexgen/regex.rb'
41
+ - 'lib/regexgen/set.rb'
42
+ - 'lib/regexgen/state.rb'
43
+ - 'lib/regexgen/trie.rb'
@@ -0,0 +1 @@
1
+ 2.6.5
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.6.5
6
+ before_install: gem install bundler -v 2.1.4
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in regexgen.gemspec
6
+ gemspec
7
+
8
+ gem 'minitest', '~> 5.0'
9
+ gem 'rake', '~> 12.0'
@@ -0,0 +1,44 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ regexgen (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.1)
10
+ byebug (11.1.3)
11
+ minitest (5.14.1)
12
+ parallel (1.19.2)
13
+ parser (2.7.1.4)
14
+ ast (~> 2.4.1)
15
+ rainbow (3.0.0)
16
+ rake (12.3.3)
17
+ regexp_parser (1.7.1)
18
+ rexml (3.2.4)
19
+ rubocop (0.89.0)
20
+ parallel (~> 1.10)
21
+ parser (>= 2.7.1.1)
22
+ rainbow (>= 2.2.2, < 4.0)
23
+ regexp_parser (>= 1.7)
24
+ rexml
25
+ rubocop-ast (>= 0.1.0, < 1.0)
26
+ ruby-progressbar (~> 1.7)
27
+ unicode-display_width (>= 1.4.0, < 2.0)
28
+ rubocop-ast (0.3.0)
29
+ parser (>= 2.7.1.4)
30
+ ruby-progressbar (1.10.1)
31
+ unicode-display_width (1.7.0)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ byebug (~> 11)
38
+ minitest (~> 5.0)
39
+ rake (~> 12.0)
40
+ regexgen!
41
+ rubocop (~> 0.89)
42
+
43
+ BUNDLED WITH
44
+ 2.1.4
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Aaron Madlon-Kay
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,85 @@
1
+ # regexgen
2
+
3
+ Generate regular expressions that match a set of strings.
4
+
5
+ This is a Ruby port of [@devongovett](https://github.com/devongovett/regexgen)'s
6
+ JavaScript [regexgen](https://github.com/devongovett/regexgen) package.
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ ```ruby
13
+ gem 'regexgen'
14
+ ```
15
+
16
+ And then execute:
17
+
18
+ $ bundle install
19
+
20
+ Or install it yourself as:
21
+
22
+ $ gem install regexgen
23
+
24
+ ## Usage
25
+
26
+ ```ruby
27
+ require 'regexgen'
28
+
29
+ Regexgen.generate(['foobar', 'foobaz', 'foozap', 'fooza']) #=> /foo(?:zap?|ba[rz])/
30
+ ```
31
+
32
+ ## Unicode handling
33
+
34
+ Unlike the JavaScript version, this package does not do any special Unicode
35
+ handling because Ruby does it all for you. You are recommended to use a Unicode
36
+ encoding for your strings.
37
+
38
+ ## How does it work?
39
+
40
+ Just like the JavaScript version:
41
+
42
+ 1. Generate a [Trie](https://en.wikipedia.org/wiki/Trie) containing all of the
43
+ input strings. This is a tree structure where each edge represents a single
44
+ character. This removes redundancies at the start of the strings, but common
45
+ branches further down are not merged.
46
+
47
+ 2. A trie can be seen as a tree-shaped deterministic finite automaton (DFA), so
48
+ DFA algorithms can be applied. In this case, we apply [Hopcroft's DFA
49
+ minimization
50
+ algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm)
51
+ to merge the nondistinguishable states.
52
+
53
+ 3. Convert the resulting minimized DFA to a regular expression. This is done
54
+ using [Brzozowski's algebraic
55
+ method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392),
56
+ which is quite elegant. It expresses the DFA as a system of equations which
57
+ can be solved for a resulting regex. Along the way, some additional
58
+ optimizations are made, such as hoisting common substrings out of an
59
+ alternation, and using character class ranges. This produces an [Abstract
60
+ Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST) for
61
+ the regex, which is then converted to a string and compiled to a Ruby
62
+ `Regexp` object.
63
+
64
+ ## Development
65
+
66
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
67
+ `rake test` to run the tests. You can also run `bin/console` for an interactive
68
+ prompt that will allow you to experiment.
69
+
70
+ To install this gem onto your local machine, run `bundle exec rake install`. To
71
+ release a new version, update the version number in `version.rb`, and then run
72
+ `bundle exec rake release`, which will create a git tag for the version, push
73
+ git commits and tags, and push the `.gem` file to
74
+ [rubygems.org](https://rubygems.org).
75
+
76
+ ## Contributing
77
+
78
+ Bug reports and pull requests are welcome on GitHub at
79
+ https://github.com/amake/regexgen.
80
+
81
+
82
+ ## License
83
+
84
+ The gem is available as open source under the terms of the [MIT
85
+ License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rake/testtask'
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << 'test'
8
+ t.libs << 'lib'
9
+ t.test_files = FileList['test/**/*_test.rb']
10
+ end
11
+
12
+ task default: :test
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'regexgen'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/version'
4
+ require 'regexgen/trie'
5
+
6
# Generate regular expressions that match a set of strings
module Regexgen
  # Build a regular expression for a collection of strings.
  #
  # strings - an enumerable of strings; each one is added to a fresh Trie.
  # flags   - optional flags, passed through unchanged to Trie#to_regex.
  #
  # Returns the value produced by Trie#to_regex.
  def self.generate(strings, flags = nil)
    trie = Trie.new
    strings.each { |string| trie.add(string) }
    trie.to_regex(flags)
  end
end
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Regexgen
4
+ # Classes in the abstract syntax tree representation
5
+ module Ast
6
# Represents an alternation (e.g. `foo|bar`)
class Alternation
  attr_reader :precedence, :options

  # options - the alternatives. Nested Alternation instances are spliced in
  #           so the stored list is flat, then ordered longest first.
  def initialize(*options)
    @precedence = 1
    @options = flatten(options).sort { |left, right| right.length <=> left.length }
  end

  # Length of the longest alternative (the first one after sorting).
  def length
    @options[0].length
  end

  # Render every alternative (parenthesized where Ast.parens decides it is
  # needed) separated by `|`.
  def to_s
    rendered = @options.map { |option| Ast.parens(option, self) }
    rendered.join('|')
  end

  private

  # Recursively replace nested Alternations by their own options.
  def flatten(options)
    expanded = options.map do |option|
      next option unless option.is_a?(Alternation)

      flatten(option.options)
    end
    expanded.flatten
  end
end
30
+
31
# Represents a character class (e.g. [0-9a-z])
class CharClass
  attr_reader :precedence

  # a, b - single characters or (possibly nested) arrays of characters; both
  #        are merged into one flat member list.
  def initialize(a, b)
    @precedence = 1
    @set = [a, b].flatten
  end

  # A character class always matches exactly one character.
  def length
    1
  end

  # True when no member has a code point above U+FFFF.
  def single_character?
    @set.none? { |c| c.ord > 0xffff }
  end

  def single_codepoint?
    true
  end

  # Regex source for the class, e.g. "[a-cx]".
  def to_s
    "[#{to_ranges_string}]"
  end

  # The flat list of member characters.
  def char_class
    @set
  end

  private

  # Render the members as the inside of a bracket expression.
  #
  # Every endpoint is passed through Regexp.escape so that characters which
  # are special inside brackets (`^`, `]`, `[`, `\`, `-`) are matched
  # literally instead of changing the meaning of the class.
  def to_ranges_string
    to_ranges.map do |first, last|
      low = Regexp.escape(first)
      high = Regexp.escape(last)
      if first == last
        low
      elsif first.ord.next == last.ord
        # Two adjacent characters: "ab" is shorter than "a-b".
        "#{low}#{high}"
      else
        "#{low}-#{high}"
      end
    end.join
  end

  # Group the sorted, de-duplicated members into [first, last] pairs of
  # consecutive code points. De-duplicating keeps a repeated member from
  # being emitted twice (a doubled `&` would read as the `&&` operator).
  def to_ranges
    set = @set.uniq.sort
    ranges = [[set.first, set.first]]
    set.drop(1).each_with_object(ranges) do |c, acc|
      if acc.last.last.ord.next == c.ord
        acc.last[-1] = c
      else
        acc << [c, c]
      end
    end
  end
end
86
+
87
# Represents a concatenation (e.g. `foo`)
class Concatenation
  attr_reader :precedence, :a, :b

  # a - the left-hand expression.
  # b - the right-hand expression.
  def initialize(a, b)
    @a = a
    @b = b
    @precedence = 2
  end

  # Combined length of both halves.
  def length
    @a.length + @b.length
  end

  # Both halves rendered back to back, each parenthesized if Ast.parens
  # decides it is needed.
  def to_s
    left = Ast.parens(@a, self)
    right = Ast.parens(@b, self)
    left + right
  end

  # The literal text at the requested edge, if that edge exposes one.
  #
  # side - :start asks the left half, :end asks the right half.
  #
  # Returns nil for any other side or when the half has no #literal.
  def literal(side)
    case side
    when :start
      @a.literal(side) if @a.respond_to?(:literal)
    when :end
      @b.literal(side) if @b.respond_to?(:literal)
    end
  end

  # Strip `len` characters from the requested edge.
  #
  # Returns the remaining half on its own when the other half responds to
  # #empty? and is empty, otherwise a new Concatenation.
  def remove_substring(side, len)
    head = @a
    tail = @b
    case side
    when :start
      head = @a.remove_substring(side, len) if @a.respond_to?(:remove_substring)
    when :end
      tail = @b.remove_substring(side, len) if @b.respond_to?(:remove_substring)
    end

    return tail if head.respond_to?(:empty?) && head.empty?
    return head if tail.respond_to?(:empty?) && tail.empty?

    Concatenation.new(head, tail)
  end
end
123
+
124
# Represents a repetition (e.g. `a*` or `a?`)
class Repetition
  attr_reader :precedence
  attr_reader :expr
  attr_reader :type

  # expr - the expression being repeated.
  # type - the quantifier appended after it when rendering (e.g. '*', '?').
  def initialize(expr, type)
    @expr = expr
    @type = type
    @precedence = 3
  end

  # Reports the length of the wrapped expression.
  def length
    @expr.length
  end

  # The wrapped expression (parenthesized if Ast.parens decides it is
  # needed) followed by the quantifier.
  def to_s
    body = Ast.parens(@expr, self)
    body + @type
  end
end
142
+
143
# Represents a literal (e.g. a string)
class Literal
  attr_reader :precedence, :value

  # value - the literal text.
  def initialize(value)
    @value = value
    @precedence = 2
  end

  # Number of characters in the literal.
  def length
    @value.length
  end

  def empty?
    @value.empty?
  end

  def single_character?
    length == 1
  end

  def single_codepoint?
    @value.codepoints.size == 1
  end

  # The literal escaped for use as regex source.
  def to_s
    Regexp.escape(@value)
  end

  # The text itself when it is a single code point, otherwise nil.
  def char_class
    single_codepoint? ? @value : nil
  end

  # The literal text; the side is irrelevant for a plain literal.
  def literal(_side = nil)
    @value
  end

  # A new Literal with `len` characters dropped from the given side.
  #
  # Returns nil when side is neither :start nor :end.
  def remove_substring(side, len)
    case side
    when :start
      Literal.new(@value[len..])
    when :end
      Literal.new(@value[0...(@value.length - len)])
    end
  end
end
185
+
186
class << self
  # Render `exp`, wrapping it in a non-capturing group when it binds more
  # loosely than `parent`.
  #
  # The group is skipped when `exp` reports itself as a single character or
  # a single code point.
  def parens(exp, parent)
    rendered = exp.to_s
    return rendered unless exp.precedence < parent.precedence
    return rendered if exp.respond_to?(:single_character?) && exp.single_character?
    return rendered if exp.respond_to?(:single_codepoint?) && exp.single_codepoint?

    "(?:#{rendered})"
  end
end
198
+ end
199
+ end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/set'
4
+
5
+ module Regexgen
6
+ using SetUtil
7
+
8
+ class<<self
9
+ # Hopcroft's DFA minimization algorithm
10
+ # https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
11
+ #
12
+ # Largely ported from
13
+ # https://github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js
14
+ #
15
+ # P := {F, Q \ F};
16
+ # W := {F, Q \ F};
17
+ # while (W is not empty) do
18
+ # choose and remove a set A from W
19
+ # for each c in Σ do
20
+ # let X be the set of states for which a transition on c leads to a state in A
21
+ # for each set Y in P for which X ∩ Y is nonempty and Y \ X is nonempty do
22
+ # replace Y in P by the two sets X ∩ Y and Y \ X
23
+ # if Y is in W
24
+ # replace Y in W by the same two sets
25
+ # else
26
+ # if |X ∩ Y| <= |Y \ X|
27
+ # add X ∩ Y to W
28
+ # else
29
+ # add Y \ X to W
30
+ # end;
31
+ # end;
32
+ # end;
33
+ #
34
+ # Key:
35
+ #
36
+ # {...} is a Set (yes we have sets of sets here)
37
+ # A \ B is complement (A - B)
38
+ # Q is all states
39
+ # F is final states
40
+ # Σ is the DFA's alphabet
41
+ # c is a letter of the alphabet
42
+ # A ∩ B is intersection (A & B)
43
+ # |A| is the cardinality of A (A.size)
44
# Merge equivalent states of the automaton reachable from `root`.
#
# root     - the start state; must respond to #visit, #transitions and
#            #accepting.
# alphabet - every character that labels a transition.
#
# Returns the new State that stands for the group containing `root`.
def minimize(root, alphabet)
  states = root.visit
  final_states = states.select(&:accepting).to_set

  partition = Set[states, final_states]
  worklist = Set.new(partition)

  until worklist.empty?
    splitter = worklist.shift
    alphabet.each do |char|
      # States whose transition on `char` lands inside the splitter. The
      # key? check avoids triggering the transitions hash's default block.
      sources = states.select do |state|
        state.transitions.key?(char) && splitter.include?(state.transitions[char])
      end.to_set

      # Iterate over a snapshot because the partition is edited in the loop.
      partition.to_a.each do |group|
        inside = sources & group
        next if inside.empty?

        outside = group - sources
        next if outside.empty?

        partition.replace(group, inside, outside)
        if worklist.include?(group)
          worklist.replace(group, inside, outside)
        else
          worklist.add(inside.size <= outside.size ? inside : outside)
        end
      end
    end
  end

  # One fresh State per group, created lazily on first lookup.
  merged = Hash.new { |hash, key| hash[key] = State.new }
  start = nil

  partition.each do |group|
    representative = group.first
    replacement = merged[group]
    representative.transitions.each do |char, target|
      target_group = partition.find { |candidate| candidate.include?(target) }
      replacement.transitions[char] = merged[target_group]
    end

    replacement.accepting = representative.accepting

    start = replacement if group.include?(root)
  end

  start
end
94
+ end
95
+ end
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/ast'
4
+
5
+ module Regexgen
6
+ class<<self
7
# Convert the automaton reachable from `root` into regex source text by
# solving it as a system of equations (Brzozowski's algebraic method).
#
# root - the start state; must respond to #visit, and every visited state
#        to #accepting and #transitions.
#
# Returns a String (empty when the start state has no expression at all).
def to_regex(root)
  states = root.visit.to_a

  # Identity-keyed position lookup, so finding a transition target does not
  # rescan the state list each time.
  position = {}.compare_by_identity
  states.each_with_index { |state, i| position[state] = i }

  # a[i][j] is the expression leading from state i to state j;
  # b[i] is the expression accepted starting at state i.
  a = []
  b = []

  states.each_with_index do |a_, i|
    b[i] = Ast::Literal.new('') if a_.accepting

    a[i] = []
    a_.transitions.each do |t, s|
      j = position[s]
      a[i][j] = a[i][j] ? union(a[i][j], Ast::Literal.new(t)) : Ast::Literal.new(t)
    end
  end

  # Eliminate states from last to first, substituting each one's equation
  # into the equations of the states before it.
  (states.length - 1).downto(0) do |n|
    if a[n][n]
      # A self-loop on state n becomes a starred prefix on everything that
      # leaves n.
      loop_expr = star(a[n][n])
      b[n] = concat(loop_expr, b[n])
      (0...n).each do |j|
        a[n][j] = concat(loop_expr, a[n][j])
      end
    end

    (0...n).each do |i|
      next unless a[i][n]

      b[i] = union(b[i], concat(a[i][n], b[n]))
      (0...n).each do |j|
        a[i][j] = union(a[i][j], concat(a[i][n], a[n][j]))
      end
    end
  end

  b[0].to_s
end
43
+
44
# Wrap an expression in a `*` repetition.
#
# Returns nil when there is no expression to repeat.
def star(exp)
  return nil unless exp

  Ast::Repetition.new(exp, '*')
end
47
+
48
# Combine two expressions into one that matches either of them.
#
# a, b - AST nodes, or nil for "no expression".
#
# When one operand is missing, or both are the same object, the operand
# itself is returned. Otherwise any literal text shared at the start or end
# is hoisted out, and the remainders are merged into an optional group, a
# character class or an alternation.
def union(a, b)
  return a || b unless a && b && a != b

  a, b, start = remove_common_substring(a, b, :start)
  a, b, end_ = remove_common_substring(a, b, :end)

  # Not every node type implements #empty?, so test each side separately
  # and reuse the answers instead of calling #empty? unguarded.
  a_empty = a.respond_to?(:empty?) && a.empty?
  b_empty = b.respond_to?(:empty?) && b.empty?

  res = if a_empty || b_empty
          # One side matches nothing more: the other side becomes optional.
          Ast::Repetition.new(a_empty ? b : a, '?')
        elsif a.is_a?(Ast::Repetition) && a.type == '?'
          Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?')
        elsif b.is_a?(Ast::Repetition) && b.type == '?'
          Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?')
        else
          ac = a.char_class if a.respond_to?(:char_class)
          bc = b.char_class if b.respond_to?(:char_class)
          if ac && bc
            Ast::CharClass.new(ac, bc)
          else
            Ast::Alternation.new(a, b)
          end
        end

  # Put the hoisted prefix and suffix back around the merged middle.
  res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty?
  res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty?

  res
end
79
+
80
# Strip the literal text two expressions share at one edge.
#
# a, b - AST nodes.
# side - :start or :end.
#
# Returns [a, b, shared]: the (possibly shortened) nodes plus the shared
# text. `shared` is nil when either node exposes no non-empty literal at
# that edge, and '' when both do but they have nothing in common.
def remove_common_substring(a, b, side)
  a_text = a.respond_to?(:literal) ? a.literal(side) : nil
  b_text = b.respond_to?(:literal) ? b.literal(side) : nil
  return [a, b, nil] if [a_text, b_text].any? { |text| text.nil? || text.empty? }

  shared = common_substring(a_text, b_text, side)
  return [a, b, ''] if shared.empty?

  count = shared.length
  trimmed_a = a.remove_substring(side, count)
  trimmed_b = b.remove_substring(side, count)

  [trimmed_a, trimmed_b, shared]
end
93
+
94
# Longest run of characters two strings share at one edge.
#
# a, b - the strings to compare.
# side - :start compares prefixes; anything else compares suffixes.
#
# Returns the shared text ('' when there is none).
def common_substring(a, b, side)
  forward = side == :start
  left = a.chars
  right = b.chars
  unless forward
    # Scan suffixes by walking both strings from their last character.
    left.reverse!
    right.reverse!
  end

  shared = 0
  shared += 1 while shared < left.length && shared < right.length && left[shared] == right[shared]

  # Rebuild the text in reading order: append for a prefix, prepend for a
  # suffix (whose characters were collected back to front).
  left.first(shared).inject('') do |text, char|
    forward ? text + char : char + text
  end
end
116
+
117
# Join two expressions in sequence, merging adjacent literals.
#
# a, b - AST nodes, or nil.
#
# Returns nil when either side is missing, the other side when one is
# empty, and otherwise a Literal or Concatenation.
def concat(a, b)
  return nil unless a && b

  return b if a.respond_to?(:empty?) && a.empty?
  return a if b.respond_to?(:empty?) && b.empty?

  a_literal = a.is_a?(Ast::Literal)
  b_literal = b.is_a?(Ast::Literal)

  # literal + literal: fuse into one literal.
  return Ast::Literal.new(a.value + b.value) if a_literal && b_literal

  # literal + (literal . rest): fuse the two leading literals.
  if a_literal && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal)
    merged_head = Ast::Literal.new(a.value + b.a.value)
    return Ast::Concatenation.new(merged_head, b.b)
  end

  # (rest . literal) + literal: fuse the two trailing literals.
  if b_literal && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal)
    merged_tail = Ast::Literal.new(a.b.value + b.value)
    return Ast::Concatenation.new(a.a, merged_tail)
  end

  Ast::Concatenation.new(a, b)
end
135
+ end
136
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
# Set must be loaded before it can be refined; Ruby versions before 3.2 do
# not load it automatically.
require 'set'

module Regexgen
  # Refinements that add worklist-style helpers to Set. They are only
  # visible in files that activate them with `using`.
  module SetUtil
    refine Set do
      # Remove and return the first element (nil when the set is empty).
      def shift
        item = first
        delete(item)
        item
      end

      # Swap one element for zero or more others.
      #
      # search       - the element to remove.
      # replacements - the elements to add in its place.
      #
      # Inside the refined scope this takes the place of the built-in
      # Set#replace. Raises RuntimeError when `search` is not a member.
      def replace(search, *replacements)
        raise 'element to replace is not in the set' unless delete?(search)

        merge(replacements)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Regexgen
4
# A node of the automaton: an accepting flag plus outgoing transitions
# keyed by character.
class State
  attr_accessor :accepting
  attr_reader :transitions

  def initialize
    # Looking up a missing character creates and stores a fresh State.
    @transitions = Hash.new { |hash, key| hash[key] = State.new }
    @accepting = false
  end

  # Collect this state and everything reachable from it.
  #
  # visited - the Set of states seen so far (filled in place).
  #
  # Returns `visited`.
  def visit(visited = Set.new)
    # add? returns nil when the state was already recorded.
    return visited unless visited.add?(self)

    @transitions.each_value { |state| state.visit(visited) }
    visited
  end

  # Nested-hash view of the reachable structure; an accepting state carries
  # an extra '' => nil entry.
  def to_h
    snapshot = @transitions.transform_values(&:to_h)
    snapshot[''] = nil if @accepting
    snapshot
  end

  # The hash view, prefixed with '*' for an accepting state.
  def to_s
    "#{'*' if @accepting}#{to_h}"
  end

  alias_method :inspect, :to_s
end
34
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexgen/state'
4
+ require 'regexgen/minimize'
5
+ require 'regexgen/regex'
6
+
7
+ module Regexgen
8
# A trie of the input strings, stored as State nodes, together with the set
# of characters seen.
class Trie
  attr_reader :root, :alphabet

  def initialize
    @root = State.new
    @alphabet = Set.new
  end

  # Insert one string, recording its characters in the alphabet and marking
  # the state it ends on as accepting.
  def add(str)
    leaf = str.each_char.inject(@root) do |node, char|
      @alphabet.add(char)
      node.transitions[char]
    end
    leaf.accepting = true
  end

  # Insert every string of an enumerable.
  def add_all(strs)
    strs.each { |str| add(str) }
  end

  # The minimized automaton for the strings added so far.
  def minimize
    Regexgen.minimize(@root, @alphabet)
  end

  # Regex source text for the strings added so far.
  def to_s
    Regexgen.to_regex(minimize)
  end

  # Compile the regex source into a Regexp.
  #
  # flags - nil, or anything responding to #include? that may contain 'x',
  #         'i' and/or 'm'.
  def to_regex(flags = nil)
    options = {
      'x' => Regexp::EXTENDED,
      'i' => Regexp::IGNORECASE,
      'm' => Regexp::MULTILINE
    }
    mask = options.inject(0) do |bits, (letter, bit)|
      flags&.include?(letter) ? bits | bit : bits
    end
    Regexp.new(to_s, mask)
  end
end
45
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Regexgen
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/regexgen/version'
4
+
5
# Gem packaging metadata for regexgen.
Gem::Specification.new do |spec|
  spec.name          = 'regexgen'
  spec.version       = Regexgen::VERSION
  spec.authors       = ['Aaron Madlon-Kay']
  spec.email         = ['aaron@madlon-kay.com']

  spec.summary       = 'Generate a minimal regex matching a set of strings'
  spec.homepage      = 'https://github.com/aaron/regexgen-ruby'
  spec.license       = 'MIT'
  # lib/regexgen/ast.rb uses endless-range syntax (`value[len..]`), which
  # Ruby versions before 2.6 cannot parse.
  spec.required_ruby_version = Gem::Requirement.new('>= 2.6.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/aaron/regexgen-ruby.git'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'byebug', '~> 11'
  spec.add_development_dependency 'rubocop', '~> 0.89'
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: regexgen
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Madlon-Kay
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-08-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: byebug
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubocop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.89'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.89'
41
+ description:
42
+ email:
43
+ - aaron@madlon-kay.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".github/workflows/main.yml"
49
+ - ".gitignore"
50
+ - ".rubocop.yml"
51
+ - ".rubocop_todo.yml"
52
+ - ".ruby-version"
53
+ - ".travis.yml"
54
+ - Gemfile
55
+ - Gemfile.lock
56
+ - LICENSE.txt
57
+ - README.md
58
+ - Rakefile
59
+ - bin/console
60
+ - bin/setup
61
+ - lib/regexgen.rb
62
+ - lib/regexgen/ast.rb
63
+ - lib/regexgen/minimize.rb
64
+ - lib/regexgen/regex.rb
65
+ - lib/regexgen/set.rb
66
+ - lib/regexgen/state.rb
67
+ - lib/regexgen/trie.rb
68
+ - lib/regexgen/version.rb
69
+ - regexgen.gemspec
70
+ homepage: https://github.com/aaron/regexgen-ruby
71
+ licenses:
72
+ - MIT
73
+ metadata:
74
+ homepage_uri: https://github.com/aaron/regexgen-ruby
75
+ source_code_uri: https://github.com/aaron/regexgen-ruby.git
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 2.3.0
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubygems_version: 3.0.3
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Generate a minimal regex matching a set of strings
95
+ test_files: []