regexp_trie 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/CI.yml +31 -0
- data/README.md +1 -1
- data/lib/regexp_trie/version.rb +1 -1
- data/lib/regexp_trie.rb +3 -1
- data/regexp_trie.gemspec +7 -2
- metadata +8 -11
- data/.travis.yml +0 -7
- data/example/benchmark.rb +0 -90
- data/example/hatena-keyword-list.csv +0 -454722
- data/example/synopsis.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3b7c0815c1b7b8f4bc6205ddaed11d259531f39bdcf6ac4a08a1edb965019d75
|
4
|
+
data.tar.gz: 9321a016f40fe9ffd441dbcc41068440d2fc1cea4ff49eb1c5efa9e559590113
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9cba5c7d1a9379061c813e8829c52e2e69915824814d7b156f492a464a6d0bee21b0fbad041263779303104215388a69610cd796f0fe633f143125feb2e9b64
|
7
|
+
data.tar.gz: 7877a5f840407312b18f2d6644ebff0c71f9c0692450c365b953dcb42b4c8e4a45f2cbd5f8583dc6408a39ca521726eb577df8de7054b369c9003b0916589ba6
|
@@ -0,0 +1,31 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ "master" ]
|
6
|
+
pull_request:
|
7
|
+
|
8
|
+
permissions:
|
9
|
+
contents: read
|
10
|
+
|
11
|
+
jobs:
|
12
|
+
test:
|
13
|
+
runs-on: ubuntu-latest
|
14
|
+
strategy:
|
15
|
+
matrix:
|
16
|
+
ruby-version:
|
17
|
+
- '2.5'
|
18
|
+
- '2.6'
|
19
|
+
- '2.7'
|
20
|
+
- '3.0'
|
21
|
+
- '3.1'
|
22
|
+
|
23
|
+
steps:
|
24
|
+
- uses: actions/checkout@v3
|
25
|
+
- name: Set up Ruby
|
26
|
+
uses: ruby/setup-ruby@v1
|
27
|
+
with:
|
28
|
+
ruby-version: '${{ matrix.ruby-version }}'
|
29
|
+
bundler-cache: true # runs 'bundle install' and caches installed gems automatically
|
30
|
+
- name: Run tests
|
31
|
+
run: bundle exec rake
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# RegexpTrie [![
|
1
|
+
# RegexpTrie [![CI](https://github.com/gfx/ruby-regexp_trie/actions/workflows/CI.yml/badge.svg)](https://github.com/gfx/ruby-regexp_trie/actions/workflows/CI.yml) [![Gem Version](https://badge.fury.io/rb/regexp_trie.svg)](https://badge.fury.io/rb/regexp_trie)
|
2
2
|
|
3
3
|
## Synopsis
|
4
4
|
|
data/lib/regexp_trie/version.rb
CHANGED
data/lib/regexp_trie.rb
CHANGED
@@ -4,8 +4,10 @@ require_relative "regexp_trie/version"
|
|
4
4
|
|
5
5
|
class RegexpTrie
|
6
6
|
|
7
|
+
# Factory method to call `new(*strings).to_regexp(option)` in short.
|
8
|
+
#
|
7
9
|
# @param [Array<String>] strings Set of patterns
|
8
|
-
# @param [
|
10
|
+
# @param [Integer,Boolean] option The second argument of `Regexp.new()` passed to build a regexp instance
|
9
11
|
# @return [Regexp]
|
10
12
|
def self.union(*strings, option: nil)
|
11
13
|
new(*strings).to_regexp(option)
|
data/regexp_trie.gemspec
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require_relative './lib/regexp_trie/version'
|
3
5
|
|
4
6
|
Gem::Specification.new do |spec|
|
@@ -11,10 +13,13 @@ Gem::Specification.new do |spec|
|
|
11
13
|
spec.description = %q{Optimized Regexp builder with Trie as a port of Perl's Regexp::Trie}
|
12
14
|
spec.homepage = "https://github.com/gfx/ruby-regexp_trie"
|
13
15
|
spec.license = "MIT"
|
16
|
+
spec.metadata = {
|
17
|
+
"source_code_uri" => "https://github.com/gfx/ruby-regexp_trie",
|
18
|
+
"allowed_push_host" => "https://rubygems.org/"
|
19
|
+
}
|
14
20
|
|
15
|
-
spec.metadata['allowed_push_host'] = 'https://rubygems.org/'
|
16
21
|
|
17
|
-
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
22
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features|example)/}) }
|
18
23
|
spec.bindir = "exe"
|
19
24
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
25
|
spec.require_paths = ["lib"]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: regexp_trie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- FUJI Goro (gfx)
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -87,8 +87,8 @@ executables: []
|
|
87
87
|
extensions: []
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
|
+
- ".github/workflows/CI.yml"
|
90
91
|
- ".gitignore"
|
91
|
-
- ".travis.yml"
|
92
92
|
- CHANGES.md
|
93
93
|
- Gemfile
|
94
94
|
- LICENSE.txt
|
@@ -96,9 +96,6 @@ files:
|
|
96
96
|
- Rakefile
|
97
97
|
- bin/console
|
98
98
|
- bin/setup
|
99
|
-
- example/benchmark.rb
|
100
|
-
- example/hatena-keyword-list.csv
|
101
|
-
- example/synopsis.rb
|
102
99
|
- lib/regexp_trie.rb
|
103
100
|
- lib/regexp_trie/version.rb
|
104
101
|
- regexp_trie.gemspec
|
@@ -106,8 +103,9 @@ homepage: https://github.com/gfx/ruby-regexp_trie
|
|
106
103
|
licenses:
|
107
104
|
- MIT
|
108
105
|
metadata:
|
106
|
+
source_code_uri: https://github.com/gfx/ruby-regexp_trie
|
109
107
|
allowed_push_host: https://rubygems.org/
|
110
|
-
post_install_message:
|
108
|
+
post_install_message:
|
111
109
|
rdoc_options: []
|
112
110
|
require_paths:
|
113
111
|
- lib
|
@@ -122,9 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
120
|
- !ruby/object:Gem::Version
|
123
121
|
version: '0'
|
124
122
|
requirements: []
|
125
|
-
|
126
|
-
|
127
|
-
signing_key:
|
123
|
+
rubygems_version: 3.2.15
|
124
|
+
signing_key:
|
128
125
|
specification_version: 4
|
129
126
|
summary: Optimized Regexp builder with Trie
|
130
127
|
test_files: []
|
data/.travis.yml
DELETED
data/example/benchmark.rb
DELETED
@@ -1,90 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'benchmark'
|
3
|
-
require 'diffy'
|
4
|
-
require 'regexp_trie'
|
5
|
-
|
6
|
-
keywords = []
|
7
|
-
File.open('example/hatena-keyword-list.csv') do |io|
|
8
|
-
io.each do |line|
|
9
|
-
yomi, word = line.split(/\t/)
|
10
|
-
word.strip!
|
11
|
-
unless word.empty?
|
12
|
-
keywords.push(word)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
puts "build regexp ..."
|
18
|
-
|
19
|
-
keywords.sort_by! { |item| -item.length }
|
20
|
-
|
21
|
-
rx_raw = Regexp.union(keywords)
|
22
|
-
rx_trie = RegexpTrie.union(keywords)
|
23
|
-
|
24
|
-
puts "rx_raw: #{rx_raw.to_s.length}"
|
25
|
-
puts "rx_trie: #{rx_trie.to_s.length}"
|
26
|
-
|
27
|
-
text = <<'EOS'
|
28
|
-
http://blog.livedoor.jp/dankogai/archives/50074802.html
|
29
|
-
|
30
|
-
TRIE-Optimized Regexp [Show on Hatena Bookmark]
|
31
|
-
これをPerlで直接使えたらうれしいよね>おおる
|
32
|
-
|
33
|
-
きまぐれ日記: はてなキーワードを高速に付与
|
34
|
-
そこで、はてなキーワードを TRIE を使って付与するプログラムを作ってみました。
|
35
|
-
というわけで、やってみました。
|
36
|
-
|
37
|
-
|
38
|
-
最初はDartsのXSを作ろうとしたのだけど、どうもtemplateばりばりのC++コードとXSは相性が悪い。でもTrieを作るだけなら、Perlでもそこそこ出来るし、実際Regexp::OptimizerやRegexp::Assembleのようなモジュールもある。ただこれらはTrie以外のOptimizeもしてしまうので、ちょっと重たいというわけで、mk_trie_regexp.plというScriptをサクっと書いてみました。
|
39
|
-
|
40
|
-
使い方は簡単。/usr/share/dict/wordsのような、一行一語のファイルを引数に指定すると、それに対応した正規表現を吐いてくれます。あとはそれを
|
41
|
-
|
42
|
-
my $re = do "keyword.list.rx";
|
43
|
-
とかして読み込めばOK。
|
44
|
-
|
45
|
-
しかし、はてなのキーワードリストはすでにRegexpとして書かれちゃっているので、これを戻す為にhatena2list.plというscriptも書いときました。
|
46
|
-
|
47
|
-
そしてベンチマークを取った結果が以下です。
|
48
|
-
|
49
|
-
PowerBook G4 1.67MHz / Mac OS X v10.4
|
50
|
-
(warning: too few iterations for a reliable count)
|
51
|
-
s/iter comp_raw comp_trie
|
52
|
-
comp_raw 4.61 -- -87%
|
53
|
-
comp_trie 0.592 679% --
|
54
|
-
Rate pm_raw pm_trie
|
55
|
-
pm_raw 156/s -- -100%
|
56
|
-
pm_trie 70337/s 44874% --
|
57
|
-
(warning: too few iterations for a reliable count)
|
58
|
-
s/iter nm_raw nm_trie
|
59
|
-
nm_raw 23.6 -- -100%
|
60
|
-
nm_trie 1.57e-02 150763% --
|
61
|
-
Dual Xeon 2.66MHz / FreeBSD 5.4-Stable
|
62
|
-
(warning: too few iterations for a reliable count)
|
63
|
-
s/iter comp_raw comp_trie
|
64
|
-
comp_raw 4.45 -- -90%
|
65
|
-
comp_trie 0.465 855% --
|
66
|
-
Rate pm_raw pm_trie
|
67
|
-
pm_raw 532/s -- -99%
|
68
|
-
pm_trie 92027/s 17197% --
|
69
|
-
(warning: too few iterations for a reliable count)
|
70
|
-
s/iter nm_raw nm_trie
|
71
|
-
nm_raw 6.91 -- -100%
|
72
|
-
nm_trie 1.22e-02 56417% --
|
73
|
-
Darts版ほどとは行きませんが、なかなかPracticalなのではないでしょうか。なんといってもPerlから直接使える--正規表現そのものはRubyでも互換?--のはぐ~でしょう。
|
74
|
-
|
75
|
-
Dan the Just Another (Perl|Trie) Hacker
|
76
|
-
EOS
|
77
|
-
|
78
|
-
unless text.gsub(rx_raw, '*') == text.gsub(rx_trie, '*')
|
79
|
-
puts '!!!differences between Regexp.union() and RegexpTrie.union()!!!'
|
80
|
-
puts Diffy::Diff.new(text.gsub(rx_raw, '*'), text.gsub(rx_trie, '*'))
|
81
|
-
end
|
82
|
-
|
83
|
-
Benchmark.bm 20 do |r|
|
84
|
-
r.report "Regexp raw" do
|
85
|
-
text.gsub(rx_raw, '*')
|
86
|
-
end
|
87
|
-
r.report "RegexpTrie" do
|
88
|
-
text.gsub(rx_trie, '*')
|
89
|
-
end
|
90
|
-
end
|