srx 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.dir-locals.el +4 -0
- data/.github/workflows/main.yml +21 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +13 -0
- data/.rubocop_todo.yml +33 -0
- data/.solargraph.yml +17 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +120 -0
- data/Rakefile +16 -0
- data/bin/benchmark +94 -0
- data/bin/console +15 -0
- data/bin/profile +33 -0
- data/bin/segment +28 -0
- data/bin/setup +8 -0
- data/lib/srx.rb +13 -0
- data/lib/srx/data.rb +169 -0
- data/lib/srx/engine.rb +136 -0
- data/lib/srx/format.rb +26 -0
- data/lib/srx/format/base_format.rb +38 -0
- data/lib/srx/format/text.rb +12 -0
- data/lib/srx/format/xml.rb +53 -0
- data/lib/srx/icu_regex.rb +22 -0
- data/lib/srx/srx-20-sample.srx +86 -0
- data/lib/srx/util.rb +16 -0
- data/lib/srx/version.rb +5 -0
- data/srx.gemspec +37 -0
- metadata +185 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'English'
|
4
|
+
|
5
|
+
module Srx
  module Format
    # Markup handling for XML input: locates start tags, end tags and
    # empty-element tags so they can be lifted out of the text.
    #
    # The patterns below follow the grammar productions of the XML
    # recommendation (Name, S, Eq, Reference, AttValue, STag, ETag,
    # EmptyElemTag). Only element tags are recognised by TAG.
    #
    # @see https://www.w3.org/TR/xml/
    class Xml < BaseFormat
      # rubocop:disable Layout/LineLength
      NAME_START_CHAR = /[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u{10000}-\u{EFFFF}]/.freeze
      # rubocop:enable Layout/LineLength
      NAME_CHAR = /#{NAME_START_CHAR}|[-.0-9\u00B7\u0300-\u036F\u203F-\u2040]/.freeze
      NAME = /#{NAME_START_CHAR}#{NAME_CHAR}*/.freeze
      SPACE = /[\u0020\u0009\u000D\u000A]+/.freeze
      EQUALS = /#{SPACE}?=#{SPACE}?/.freeze
      ENTITY_REF = /&#{NAME};/.freeze
      CHAR_REF = /&#[0-9]+;|&#x[0-9a-fA-F]+;/.freeze
      REFERENCE = /#{ENTITY_REF}|#{CHAR_REF}/.freeze
      ATT_VALUE = /"(?:[^<&"]|#{REFERENCE})*"|'(?:[^<&']|#{REFERENCE})*'/.freeze
      ATTRIBUTE = /#{NAME}#{EQUALS}#{ATT_VALUE}/.freeze
      START_TAG = /<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?>/.freeze
      END_TAG = %r{</#{NAME}#{SPACE}?>}.freeze
      EMPTY_ELEM_TAG = %r{<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?/>}.freeze

      # Any single element tag: opening, closing or self-closing.
      TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze

      # Strip every element tag out of +str+, remembering where each one was.
      #
      # @param str [String] text that may contain XML tags
      # @return [Array(String, Array<Array(Integer, String)>)] the text with
      #   all tags removed, plus one +[offset, tag]+ pair per removed tag in
      #   order of appearance; +offset+ is the tag's character offset in the
      #   original +str+ (not in the stripped text)
      def extract_markups(str)
        found = []

        stripped = str.gsub(TAG) do |tag|
          # Regexp.last_match is the MatchData for the tag being replaced.
          found.push([Regexp.last_match.begin(0), tag])
          ''
        end

        [stripped, found]
      end

      # @param markup [String] a tag, e.g. one collected by #extract_markups
      # @return [Boolean] whether +markup+ contains an opening tag
      def start_formatting?(markup)
        START_TAG.match?(markup)
      end

      # @param markup [String] a tag, e.g. one collected by #extract_markups
      # @return [Boolean] whether +markup+ contains a closing tag
      def end_formatting?(markup)
        END_TAG.match?(markup)
      end

      # @param markup [String] a tag, e.g. one collected by #extract_markups
      # @return [Boolean] whether +markup+ contains an empty-element tag
      def isolated_formatting?(markup)
        EMPTY_ELEM_TAG.match?(markup)
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  # Utilities for handling SRX (ICU) regular expressions
  module IcuRegex
    # Matches an unescaped ICU hex escape, either +\xhhhh+ or +\x{hhhh}+
    # (four hex digits, case-insensitive).
    #
    # The escape only counts when the backslash introducing it is not itself
    # escaped, i.e. when it is preceded by an even number of backslashes.
    # That leading run of escaped backslashes is part of the match, so it is
    # captured as +escapes+ to let the replacement put it back unchanged.
    # +hex+ captures the digits, including the braces when present.
    HEX_PATTERN = /(?<!\\)(?<escapes>(?:\\\\)*)\\x(?<hex>[a-f0-9]{4}|\{[a-f0-9]{4}\})/i.freeze

    class << self
      # Rewrite ICU hex escapes into the Ruby +\u+ form, leaving the rest of
      # the pattern untouched.
      #
      # @example
      #   IcuRegex.to_ruby('[\xff61\x3002]+') # => '[\uff61\u3002]+'
      #
      # @param icu_regex [String] regular expression source in ICU syntax
      # @return [String] the equivalent source in Ruby syntax
      def to_ruby(icu_regex)
        # Re-emit the escaped backslashes swallowed by the match; dropping
        # them would turn "literal backslash + \x0041" into just "\u0041".
        icu_regex.gsub(HEX_PATTERN, '\k<escapes>\u\k<hex>')
      end

      # Convert an ICU pattern to Ruby syntax and compile it.
      #
      # @param icu_regex [String] regular expression source in ICU syntax
      # @return [Regexp]
      # @raise [RegexpError] if the converted source is not a valid Ruby
      #   regular expression
      def compile(icu_regex)
        Regexp.new(to_ruby(icu_regex))
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<srx version="2.0"
|
3
|
+
xmlns="http://www.lisa.org/srx20"
|
4
|
+
xsi:schemaLocation="http://www.lisa.org/srx20 srx20.xsd"
|
5
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
6
|
+
<header segmentsubflows="yes" cascade="yes">
|
7
|
+
<formathandle type="start" include="no"/>
|
8
|
+
<formathandle type="end" include="yes"/>
|
9
|
+
<formathandle type="isolated" include="yes"/>
|
10
|
+
</header>
|
11
|
+
<body>
|
12
|
+
<languagerules>
|
13
|
+
<languagerule languagerulename="Default">
|
14
|
+
<!-- Common rules for most languages -->
|
15
|
+
<rule break="no">
|
16
|
+
<beforebreak>^\s*[0-9]+\.</beforebreak>
|
17
|
+
<afterbreak>\s</afterbreak>
|
18
|
+
</rule>
|
19
|
+
<rule break="yes">
|
20
|
+
<afterbreak>\n</afterbreak>
|
21
|
+
</rule>
|
22
|
+
<rule break="yes">
|
23
|
+
<beforebreak>[\.\?!]+</beforebreak>
|
24
|
+
<afterbreak>\s</afterbreak>
|
25
|
+
</rule>
|
26
|
+
</languagerule>
|
27
|
+
<languagerule languagerulename="English">
|
28
|
+
<!-- Some English abbreviations -->
|
29
|
+
<rule break="no">
|
30
|
+
<beforebreak>\s[Ee][Tt][Cc]\.</beforebreak>
|
31
|
+
<afterbreak>\s[a-z]</afterbreak>
|
32
|
+
</rule>
|
33
|
+
<rule break="no">
|
34
|
+
<beforebreak>\sMr\.</beforebreak>
|
35
|
+
<afterbreak>\s</afterbreak>
|
36
|
+
</rule>
|
37
|
+
<rule break="no">
|
38
|
+
<beforebreak>\sU\.K\.</beforebreak>
|
39
|
+
<afterbreak>\s</afterbreak>
|
40
|
+
</rule>
|
41
|
+
</languagerule>
|
42
|
+
<languagerule languagerulename="French">
|
43
|
+
<!-- Some French abbreviations -->
|
44
|
+
<rule break="no">
|
45
|
+
<beforebreak>\s[Mm]lle\.</beforebreak>
|
46
|
+
<afterbreak>\s</afterbreak>
|
47
|
+
</rule>
|
48
|
+
<rule break="no">
|
49
|
+
<beforebreak>\s[Mm]lles\.</beforebreak>
|
50
|
+
<afterbreak>\s</afterbreak>
|
51
|
+
</rule>
|
52
|
+
<rule break="no">
|
53
|
+
<beforebreak>\s[Mm]me\.</beforebreak>
|
54
|
+
<afterbreak>\s</afterbreak>
|
55
|
+
</rule>
|
56
|
+
<rule break="no">
|
57
|
+
<beforebreak>\s[Mm]mes\.</beforebreak>
|
58
|
+
<afterbreak>\s</afterbreak>
|
59
|
+
</rule>
|
60
|
+
</languagerule>
|
61
|
+
<languagerule languagerulename="Japanese">
|
62
|
+
<!-- Rules for breaking on Japanese punctuation
|
63
|
+
|
64
|
+
\xff61: Halfwidth ideographic full stop
|
65
|
+
\x3002: Ideographic full stop
|
66
|
+
\xff0e: Fullwidth full stop
|
67
|
+
\xff1f: Fullwidth question mark
|
68
|
+
\xff01: Fullwidth exclamation mark
|
69
|
+
-->
|
70
|
+
<rule break="yes">
|
71
|
+
<beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
|
72
|
+
<afterbreak></afterbreak>
|
73
|
+
</rule>
|
74
|
+
</languagerule>
|
75
|
+
</languagerules>
|
76
|
+
<maprules>
|
77
|
+
<!-- List exceptions first -->
|
78
|
+
<languagemap languagepattern="[Ee][Nn].*" languagerulename="English"/>
|
79
|
+
<languagemap languagepattern="[Ff][Rr].*" languagerulename="French"/>
|
80
|
+
<!-- Japanese breaking rules -->
|
81
|
+
<languagemap languagepattern="[Jj][Aa].*" languagerulename="Japanese"/>
|
82
|
+
<!-- Common breaking rules -->
|
83
|
+
<languagemap languagepattern=".*" languagerulename="Default"/>
|
84
|
+
</maprules>
|
85
|
+
</body>
|
86
|
+
</srx>
|
data/lib/srx/util.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  # Miscellaneous utility functions
  module Util
    # Undo hard line wrapping: a single newline sitting directly between two
    # non-whitespace characters becomes a space. Newlines next to whitespace
    # (blank lines, indented continuation lines) are kept as they are.
    #
    # @param str [String]
    # @return [String] a new string; +str+ itself is not modified
    def self.unwrap(str)
      wrapping_newline = /(?<=\S)\n(?=\S)/
      str.gsub(wrapping_newline, ' ')
    end
  end
end
|
data/lib/srx/version.rb
ADDED
data/srx.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/srx/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name = 'srx'
  spec.version = Srx::VERSION
  spec.authors = ['Aaron Madlon-Kay']
  spec.email = ['aaron@madlon-kay.com']

  spec.summary = 'An SRX segmenting engine'
  spec.homepage = 'https://github.com/amake/srx-ruby'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.4.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'

  # Package every file tracked by git (listed from the gemspec's own
  # directory), leaving out anything under test/, spec/ or features/.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  # Only files under exe/ are installed as executables.
  spec.executables = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependency
  spec.add_dependency 'nokogiri', '~>1.11'

  # Development-only tooling, unversioned
  %w[
    byebug
    memory_profiler
    minitest
    rake
    rspec-expectations
    rubocop
    solargraph
  ].each do |tool|
    spec.add_development_dependency tool
  end
end
|
metadata
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aaron Madlon-Kay
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-02-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.11'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.11'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: byebug
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: memory_profiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec-expectations
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: solargraph
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description:
|
126
|
+
email:
|
127
|
+
- aaron@madlon-kay.com
|
128
|
+
executables: []
|
129
|
+
extensions: []
|
130
|
+
extra_rdoc_files: []
|
131
|
+
files:
|
132
|
+
- ".dir-locals.el"
|
133
|
+
- ".github/workflows/main.yml"
|
134
|
+
- ".gitignore"
|
135
|
+
- ".rubocop.yml"
|
136
|
+
- ".rubocop_todo.yml"
|
137
|
+
- ".solargraph.yml"
|
138
|
+
- Gemfile
|
139
|
+
- Gemfile.lock
|
140
|
+
- LICENSE.txt
|
141
|
+
- README.md
|
142
|
+
- Rakefile
|
143
|
+
- bin/benchmark
|
144
|
+
- bin/console
|
145
|
+
- bin/profile
|
146
|
+
- bin/segment
|
147
|
+
- bin/setup
|
148
|
+
- lib/srx.rb
|
149
|
+
- lib/srx/data.rb
|
150
|
+
- lib/srx/engine.rb
|
151
|
+
- lib/srx/format.rb
|
152
|
+
- lib/srx/format/base_format.rb
|
153
|
+
- lib/srx/format/text.rb
|
154
|
+
- lib/srx/format/xml.rb
|
155
|
+
- lib/srx/icu_regex.rb
|
156
|
+
- lib/srx/srx-20-sample.srx
|
157
|
+
- lib/srx/util.rb
|
158
|
+
- lib/srx/version.rb
|
159
|
+
- srx.gemspec
|
160
|
+
homepage: https://github.com/amake/srx-ruby
|
161
|
+
licenses:
|
162
|
+
- MIT
|
163
|
+
metadata:
|
164
|
+
homepage_uri: https://github.com/amake/srx-ruby
|
165
|
+
source_code_uri: https://github.com/amake/srx-ruby.git
|
166
|
+
post_install_message:
|
167
|
+
rdoc_options: []
|
168
|
+
require_paths:
|
169
|
+
- lib
|
170
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
171
|
+
requirements:
|
172
|
+
- - ">="
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
version: 2.4.0
|
175
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - ">="
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0'
|
180
|
+
requirements: []
|
181
|
+
rubygems_version: 3.1.4
|
182
|
+
signing_key:
|
183
|
+
specification_version: 4
|
184
|
+
summary: An SRX segmenting engine
|
185
|
+
test_files: []
|