srx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dir-locals.el +4 -0
- data/.github/workflows/main.yml +21 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +13 -0
- data/.rubocop_todo.yml +33 -0
- data/.solargraph.yml +17 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +120 -0
- data/Rakefile +16 -0
- data/bin/benchmark +94 -0
- data/bin/console +15 -0
- data/bin/profile +33 -0
- data/bin/segment +28 -0
- data/bin/setup +8 -0
- data/lib/srx.rb +13 -0
- data/lib/srx/data.rb +169 -0
- data/lib/srx/engine.rb +136 -0
- data/lib/srx/format.rb +26 -0
- data/lib/srx/format/base_format.rb +38 -0
- data/lib/srx/format/text.rb +12 -0
- data/lib/srx/format/xml.rb +53 -0
- data/lib/srx/icu_regex.rb +22 -0
- data/lib/srx/srx-20-sample.srx +86 -0
- data/lib/srx/util.rb +16 -0
- data/lib/srx/version.rb +5 -0
- data/srx.gemspec +37 -0
- metadata +185 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'English'
|
4
|
+
|
5
|
+
module Srx
  module Format
    # Markup handling for XML content.
    #
    # The constants below are regular expressions named after the grammar
    # productions they are modelled on.
    #
    # @see https://www.w3.org/TR/xml/
    class Xml < BaseFormat
      # rubocop:disable Layout/LineLength
      NAME_START_CHAR = /[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u{10000}-\u{EFFFF}]/.freeze
      # rubocop:enable Layout/LineLength
      NAME_CHAR = /#{NAME_START_CHAR}|[-.0-9\u00B7\u0300-\u036F\u203F-\u2040]/.freeze
      NAME = /#{NAME_START_CHAR}#{NAME_CHAR}*/.freeze
      SPACE = /[\u0020\u0009\u000D\u000A]+/.freeze
      EQUALS = /#{SPACE}?=#{SPACE}?/.freeze
      ENTITY_REF = /&#{NAME};/.freeze
      CHAR_REF = /&#[0-9]+;|&#x[0-9a-fA-F]+;/.freeze
      REFERENCE = /#{ENTITY_REF}|#{CHAR_REF}/.freeze
      ATT_VALUE = /"(?:[^<&"]|#{REFERENCE})*"|'(?:[^<&']|#{REFERENCE})*'/.freeze
      ATTRIBUTE = /#{NAME}#{EQUALS}#{ATT_VALUE}/.freeze
      START_TAG = /<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?>/.freeze
      END_TAG = %r{</#{NAME}#{SPACE}?>}.freeze
      EMPTY_ELEM_TAG = %r{<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?/>}.freeze

      # Any start tag, end tag or empty-element tag.
      TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze

      # Strip every tag out of +str+, remembering where each one was found.
      #
      # @param str [String]
      # @return [Array(String, Array<Array(Integer, String)>)] the text with
      #   all tags removed, followed by one +[offset, tag]+ pair per removed
      #   tag, where +offset+ is the position of the tag within +str+
      def extract_markups(str)
        found = []

        stripped = str.gsub(TAG) do |tag|
          found.push([Regexp.last_match.begin(0), tag])
          ''
        end

        [stripped, found]
      end

      # @param markup [String]
      # @return [Boolean] whether +markup+ matches {START_TAG}
      def start_formatting?(markup)
        START_TAG.match?(markup)
      end

      # @param markup [String]
      # @return [Boolean] whether +markup+ matches {END_TAG}
      def end_formatting?(markup)
        END_TAG.match?(markup)
      end

      # @param markup [String]
      # @return [Boolean] whether +markup+ matches {EMPTY_ELEM_TAG}
      def isolated_formatting?(markup)
        EMPTY_ELEM_TAG.match?(markup)
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  # Utilities for handling SRX (ICU) regular expressions
  module IcuRegex
    # Matches an unescaped ICU hex escape written as +\xhhhh+ or +\x{hhhh}+.
    #
    # * +esc+ captures the escaped backslashes (backslash pairs) that sit
    #   directly in front of the escape; they are part of the match only so
    #   that an odd/even backslash count can be told apart, and must be put
    #   back by whoever substitutes the match.
    # * +hex+ captures the four hex digits, including the braces if present.
    #
    # Only a lowercase +x+ is accepted: +\X+ is not a hex escape (ICU and Ruby
    # both use it for grapheme clusters), so it must be left untouched.
    HEX_PATTERN = /(?<!\\)(?<esc>(?:\\\\)*)\\x(?<hex>[a-fA-F0-9]{4}|\{[a-fA-F0-9]{4}\})/.freeze

    class << self
      # Rewrite ICU-style hex escapes as Ruby +\u+ escapes.
      #
      # Escaped backslashes preceding a hex escape are preserved, and a
      # backslash-escaped +\x+ (i.e. a literal backslash followed by "x") is
      # not converted.
      #
      # @param icu_regex [String]
      # @return [String]
      def to_ruby(icu_regex)
        # Re-emit the captured backslash pairs; dropping them would change the
        # meaning of the pattern.
        icu_regex.gsub(HEX_PATTERN, '\k<esc>\u\k<hex>')
      end

      # Convert an ICU regex source string and compile it as a Ruby Regexp.
      #
      # @param icu_regex [String]
      # @return [Regexp]
      def compile(icu_regex)
        Regexp.new(to_ruby(icu_regex))
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<srx version="2.0"
|
3
|
+
xmlns="http://www.lisa.org/srx20"
|
4
|
+
xsi:schemaLocation="http://www.lisa.org/srx20 srx20.xsd"
|
5
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
6
|
+
<header segmentsubflows="yes" cascade="yes">
|
7
|
+
<formathandle type="start" include="no"/>
|
8
|
+
<formathandle type="end" include="yes"/>
|
9
|
+
<formathandle type="isolated" include="yes"/>
|
10
|
+
</header>
|
11
|
+
<body>
|
12
|
+
<languagerules>
|
13
|
+
<languagerule languagerulename="Default">
|
14
|
+
<!-- Common rules for most languages -->
|
15
|
+
<rule break="no">
|
16
|
+
<beforebreak>^\s*[0-9]+\.</beforebreak>
|
17
|
+
<afterbreak>\s</afterbreak>
|
18
|
+
</rule>
|
19
|
+
<rule break="yes">
|
20
|
+
<afterbreak>\n</afterbreak>
|
21
|
+
</rule>
|
22
|
+
<rule break="yes">
|
23
|
+
<beforebreak>[\.\?!]+</beforebreak>
|
24
|
+
<afterbreak>\s</afterbreak>
|
25
|
+
</rule>
|
26
|
+
</languagerule>
|
27
|
+
<languagerule languagerulename="English">
|
28
|
+
<!-- Some English abbreviations -->
|
29
|
+
<rule break="no">
|
30
|
+
<beforebreak>\s[Ee][Tt][Cc]\.</beforebreak>
|
31
|
+
<afterbreak>\s[a-z]</afterbreak>
|
32
|
+
</rule>
|
33
|
+
<rule break="no">
|
34
|
+
<beforebreak>\sMr\.</beforebreak>
|
35
|
+
<afterbreak>\s</afterbreak>
|
36
|
+
</rule>
|
37
|
+
<rule break="no">
|
38
|
+
<beforebreak>\sU\.K\.</beforebreak>
|
39
|
+
<afterbreak>\s</afterbreak>
|
40
|
+
</rule>
|
41
|
+
</languagerule>
|
42
|
+
<languagerule languagerulename="French">
|
43
|
+
<!-- Some French abbreviations -->
|
44
|
+
<rule break="no">
|
45
|
+
<beforebreak>\s[Mm]lle\.</beforebreak>
|
46
|
+
<afterbreak>\s</afterbreak>
|
47
|
+
</rule>
|
48
|
+
<rule break="no">
|
49
|
+
<beforebreak>\s[Mm]lles\.</beforebreak>
|
50
|
+
<afterbreak>\s</afterbreak>
|
51
|
+
</rule>
|
52
|
+
<rule break="no">
|
53
|
+
<beforebreak>\s[Mm]me\.</beforebreak>
|
54
|
+
<afterbreak>\s</afterbreak>
|
55
|
+
</rule>
|
56
|
+
<rule break="no">
|
57
|
+
<beforebreak>\s[Mm]mes\.</beforebreak>
|
58
|
+
<afterbreak>\s</afterbreak>
|
59
|
+
</rule>
|
60
|
+
</languagerule>
|
61
|
+
<languagerule languagerulename="Japanese">
|
62
|
+
<!-- Rules for breaking on Japanese punctuation
|
63
|
+
|
64
|
+
\xff61: Halfwidth ideographic full stop
|
65
|
+
\x3002: Ideographic full stop
|
66
|
+
\xff0e: Fullwidth full stop
|
67
|
+
\xff1f: Fullwidth question mark
|
68
|
+
\xff01: Fullwidth exclamation mark
|
69
|
+
-->
|
70
|
+
<rule break="yes">
|
71
|
+
<beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
|
72
|
+
<afterbreak></afterbreak>
|
73
|
+
</rule>
|
74
|
+
</languagerule>
|
75
|
+
</languagerules>
|
76
|
+
<maprules>
|
77
|
+
<!-- List exceptions first -->
|
78
|
+
<languagemap languagepattern="[Ee][Nn].*" languagerulename="English"/>
|
79
|
+
<languagemap languagepattern="[Ff][Rr].*" languagerulename="French"/>
|
80
|
+
<!-- Japanese breaking rules -->
|
81
|
+
<languagemap languagepattern="[Jj][Aa].*" languagerulename="Japanese"/>
|
82
|
+
<!-- Common breaking rules -->
|
83
|
+
<languagemap languagepattern=".*" languagerulename="Default"/>
|
84
|
+
</maprules>
|
85
|
+
</body>
|
86
|
+
</srx>
|
data/lib/srx/util.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  # Miscellaneous utility functions
  module Util
    class << self
      # Remove linebreaks that wrap lines.
      #
      # A single linebreak (LF or CRLF) that has non-whitespace characters on
      # both sides is replaced with one space. Linebreaks next to whitespace,
      # such as blank lines between paragraphs or breaks before indented
      # text, are left as they are.
      #
      # @param str [String]
      # @return [String] a new string; +str+ itself is not modified
      def unwrap(str)
        # The optional \r lets CRLF-terminated text unwrap too; without it the
        # \r in front of \n would fail the non-whitespace lookbehind.
        str.gsub(/(?<=\S)\r?\n(?=\S)/, ' ')
      end
    end
  end
end
|
data/lib/srx/version.rb
ADDED
data/srx.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/srx/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name = 'srx'
  spec.version = Srx::VERSION
  spec.authors = ['Aaron Madlon-Kay']
  spec.email = ['aaron@madlon-kay.com']

  # Description and requirements
  spec.summary = 'An SRX segmenting engine'
  spec.homepage = 'https://github.com/amake/srx-ruby'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.4.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'

  # Package every file tracked by git (listed NUL-separated by
  # `git ls-files -z`), leaving out anything under test/, spec/ or features/.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.grep_v(%r{\A(?:test|spec|features)/})
  end
  spec.bindir = 'exe'
  # Executables are the basenames of the packaged files that live under exe/.
  spec.executables = spec.files.grep(%r{\Aexe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependency
  spec.add_dependency 'nokogiri', '~>1.11'

  # Development-only dependencies (no version constraints)
  %w[
    byebug
    memory_profiler
    minitest
    rake
    rspec-expectations
    rubocop
    solargraph
  ].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
|
metadata
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aaron Madlon-Kay
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-02-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.11'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.11'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: byebug
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: memory_profiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec-expectations
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: solargraph
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description:
|
126
|
+
email:
|
127
|
+
- aaron@madlon-kay.com
|
128
|
+
executables: []
|
129
|
+
extensions: []
|
130
|
+
extra_rdoc_files: []
|
131
|
+
files:
|
132
|
+
- ".dir-locals.el"
|
133
|
+
- ".github/workflows/main.yml"
|
134
|
+
- ".gitignore"
|
135
|
+
- ".rubocop.yml"
|
136
|
+
- ".rubocop_todo.yml"
|
137
|
+
- ".solargraph.yml"
|
138
|
+
- Gemfile
|
139
|
+
- Gemfile.lock
|
140
|
+
- LICENSE.txt
|
141
|
+
- README.md
|
142
|
+
- Rakefile
|
143
|
+
- bin/benchmark
|
144
|
+
- bin/console
|
145
|
+
- bin/profile
|
146
|
+
- bin/segment
|
147
|
+
- bin/setup
|
148
|
+
- lib/srx.rb
|
149
|
+
- lib/srx/data.rb
|
150
|
+
- lib/srx/engine.rb
|
151
|
+
- lib/srx/format.rb
|
152
|
+
- lib/srx/format/base_format.rb
|
153
|
+
- lib/srx/format/text.rb
|
154
|
+
- lib/srx/format/xml.rb
|
155
|
+
- lib/srx/icu_regex.rb
|
156
|
+
- lib/srx/srx-20-sample.srx
|
157
|
+
- lib/srx/util.rb
|
158
|
+
- lib/srx/version.rb
|
159
|
+
- srx.gemspec
|
160
|
+
homepage: https://github.com/amake/srx-ruby
|
161
|
+
licenses:
|
162
|
+
- MIT
|
163
|
+
metadata:
|
164
|
+
homepage_uri: https://github.com/amake/srx-ruby
|
165
|
+
source_code_uri: https://github.com/amake/srx-ruby.git
|
166
|
+
post_install_message:
|
167
|
+
rdoc_options: []
|
168
|
+
require_paths:
|
169
|
+
- lib
|
170
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
171
|
+
requirements:
|
172
|
+
- - ">="
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
version: 2.4.0
|
175
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - ">="
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0'
|
180
|
+
requirements: []
|
181
|
+
rubygems_version: 3.1.4
|
182
|
+
signing_key:
|
183
|
+
specification_version: 4
|
184
|
+
summary: An SRX segmenting engine
|
185
|
+
test_files: []
|