srx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  module Format
    # Format handler for plain text, which carries no markup at all.
    class Text < BaseFormat
      # Plain text has nothing to strip, so the input is handed back
      # untouched together with an empty list of markups.
      #
      # @param str [String]
      # @return [Array(String, Array)] the unchanged string and an empty
      #   markup list
      def extract_markups(str)
        no_markups = []
        [str, no_markups]
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'English'
4
+
5
module Srx
  module Format
    # Format handler for XML. Tags are recognised with regular expressions
    # assembled from the productions of the XML specification.
    #
    # @see https://www.w3.org/TR/xml/
    class Xml < BaseFormat
      # rubocop:disable Layout/LineLength
      NAME_START_CHAR = /[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u{10000}-\u{EFFFF}]/.freeze
      # rubocop:enable Layout/LineLength
      NAME_CHAR = /#{NAME_START_CHAR}|[-.0-9\u00B7\u0300-\u036F\u203F-\u2040]/.freeze
      NAME = /#{NAME_START_CHAR}#{NAME_CHAR}*/.freeze
      SPACE = /[\u0020\u0009\u000D\u000A]+/.freeze
      EQUALS = /#{SPACE}?=#{SPACE}?/.freeze
      ENTITY_REF = /&#{NAME};/.freeze
      CHAR_REF = /&#[0-9]+;|&#x[0-9a-fA-F]+;/.freeze
      REFERENCE = /#{ENTITY_REF}|#{CHAR_REF}/.freeze
      ATT_VALUE = /"(?:[^<&"]|#{REFERENCE})*"|'(?:[^<&']|#{REFERENCE})*'/.freeze
      ATTRIBUTE = /#{NAME}#{EQUALS}#{ATT_VALUE}/.freeze
      START_TAG = /<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?>/.freeze
      END_TAG = %r{</#{NAME}#{SPACE}?>}.freeze
      EMPTY_ELEM_TAG = %r{<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?/>}.freeze

      # Any start tag, end tag or empty-element tag.
      TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze

      # Strip every tag out of +str+, remembering where each one was found.
      #
      # @param str [String]
      # @return [Array(String, Array<Array(Integer, String)>)] the text with
      #   all tags removed, plus one +[offset, tag]+ pair per removed tag;
      #   each offset is the tag's start position within the original +str+
      def extract_markups(str)
        found = []

        stripped = str.gsub(TAG) do |tag|
          # Regexp.last_match is the match currently being replaced by gsub.
          offset = Regexp.last_match.begin(0)
          found.push([offset, tag])
          ''
        end

        [stripped, found]
      end

      # @param markup [String]
      # @return [Boolean] whether +markup+ contains an XML start tag
      def start_formatting?(markup)
        START_TAG.match?(markup)
      end

      # @param markup [String]
      # @return [Boolean] whether +markup+ contains an XML end tag
      def end_formatting?(markup)
        END_TAG.match?(markup)
      end

      # @param markup [String]
      # @return [Boolean] whether +markup+ contains an XML empty-element tag
      def isolated_formatting?(markup)
        EMPTY_ELEM_TAG.match?(markup)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Utilities for handling SRX (ICU) regular expressions
  module IcuRegex
    # Matches an unescaped ICU hex escape of the form +\xhhhh+ or +\x{hhhh}+.
    # The leading lookbehind plus the run of backslash pairs ensure the +\x+
    # itself is not escaped; note that those backslash pairs are part of the
    # overall match.
    HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>[a-f0-9]{4}|\{[a-f0-9]{4}\})/i.freeze

    class << self
      # Rewrite ICU-style hex escapes (+\xhhhh+, +\x{hhhh}+) as Ruby Unicode
      # escapes (+\uhhhh+, +\u{hhhh}+). Everything else is left untouched.
      #
      # @param icu_regex [String] regex source in ICU syntax
      # @return [String] regex source in Ruby syntax
      def to_ruby(icu_regex)
        icu_regex.gsub(HEX_PATTERN) do |match|
          hex = Regexp.last_match[:hex]
          # HEX_PATTERN also consumes any escaped backslashes that directly
          # precede the escape; they are literal backslashes in the regex and
          # must be emitted again rather than dropped. The match always ends
          # with a two-character "\x" followed by the hex part.
          escaped_backslashes = match[0, match.length - hex.length - 2]
          "#{escaped_backslashes}\\u#{hex}"
        end
      end

      # Compile ICU regex source into a Ruby regexp.
      #
      # @param icu_regex [String] regex source in ICU syntax
      # @return [Regexp]
      # @raise [RegexpError] if the converted source is not a valid Ruby regex
      def compile(icu_regex)
        Regexp.new(to_ruby(icu_regex))
      end
    end
  end
end
@@ -0,0 +1,86 @@
1
+ <?xml version="1.0"?>
2
+ <srx version="2.0"
3
+ xmlns="http://www.lisa.org/srx20"
4
+ xsi:schemaLocation="http://www.lisa.org/srx20 srx20.xsd"
5
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
6
+ <header segmentsubflows="yes" cascade="yes">
7
+ <formathandle type="start" include="no"/>
8
+ <formathandle type="end" include="yes"/>
9
+ <formathandle type="isolated" include="yes"/>
10
+ </header>
11
+ <body>
12
+ <languagerules>
13
+ <languagerule languagerulename="Default">
14
+ <!-- Common rules for most languages -->
15
+ <rule break="no">
16
+ <beforebreak>^\s*[0-9]+\.</beforebreak>
17
+ <afterbreak>\s</afterbreak>
18
+ </rule>
19
+ <rule break="yes">
20
+ <afterbreak>\n</afterbreak>
21
+ </rule>
22
+ <rule break="yes">
23
+ <beforebreak>[\.\?!]+</beforebreak>
24
+ <afterbreak>\s</afterbreak>
25
+ </rule>
26
+ </languagerule>
27
+ <languagerule languagerulename="English">
28
+ <!-- Some English abbreviations -->
29
+ <rule break="no">
30
+ <beforebreak>\s[Ee][Tt][Cc]\.</beforebreak>
31
+ <afterbreak>\s[a-z]</afterbreak>
32
+ </rule>
33
+ <rule break="no">
34
+ <beforebreak>\sMr\.</beforebreak>
35
+ <afterbreak>\s</afterbreak>
36
+ </rule>
37
+ <rule break="no">
38
+ <beforebreak>\sU\.K\.</beforebreak>
39
+ <afterbreak>\s</afterbreak>
40
+ </rule>
41
+ </languagerule>
42
+ <languagerule languagerulename="French">
43
+ <!-- Some French abbreviations -->
44
+ <rule break="no">
45
+ <beforebreak>\s[Mm]lle\.</beforebreak>
46
+ <afterbreak>\s</afterbreak>
47
+ </rule>
48
+ <rule break="no">
49
+ <beforebreak>\s[Mm]lles\.</beforebreak>
50
+ <afterbreak>\s</afterbreak>
51
+ </rule>
52
+ <rule break="no">
53
+ <beforebreak>\s[Mm]me\.</beforebreak>
54
+ <afterbreak>\s</afterbreak>
55
+ </rule>
56
+ <rule break="no">
57
+ <beforebreak>\s[Mm]mes\.</beforebreak>
58
+ <afterbreak>\s</afterbreak>
59
+ </rule>
60
+ </languagerule>
61
+ <languagerule languagerulename="Japanese">
62
+ <!-- Rules for breaking on Japanese punctuation
63
+
64
+ \xff61: Halfwidth ideographic full stop
65
+ \x3002: Ideographic full stop
66
+ \xff0e: Fullwidth full stop
67
+ \xff1f: Fullwidth question mark
68
+ \xff01: Fullwidth exclamation mark
69
+ -->
70
+ <rule break="yes">
71
+ <beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
72
+ <afterbreak></afterbreak>
73
+ </rule>
74
+ </languagerule>
75
+ </languagerules>
76
+ <maprules>
77
+ <!-- List exceptions first -->
78
+ <languagemap languagepattern="[Ee][Nn].*" languagerulename="English"/>
79
+ <languagemap languagepattern="[Ff][Rr].*" languagerulename="French"/>
80
+ <!-- Japanese breaking rules -->
81
+ <languagemap languagepattern="[Jj][Aa].*" languagerulename="Japanese"/>
82
+ <!-- Common breaking rules -->
83
+ <languagemap languagepattern=".*" languagerulename="Default"/>
84
+ </maprules>
85
+ </body>
86
+ </srx>
data/lib/srx/util.rb ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Miscellaneous utility functions
  module Util
    # Join hard-wrapped lines: a single newline that sits directly between
    # two non-whitespace characters becomes one space. Newlines next to
    # whitespace (including blank lines) are left alone.
    #
    # @param str [String]
    # @return [String]
    def self.unwrap(str)
      wrapping_newline = /(?<=\S)\n(?=\S)/
      str.gsub(wrapping_newline, ' ')
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Version string of the srx gem; read by the gemspec.
  VERSION = '0.1.0'
end
data/srx.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/srx/version'
4
+
5
Gem::Specification.new do |spec|
  # Identity
  spec.name = 'srx'
  spec.version = Srx::VERSION
  spec.authors = ['Aaron Madlon-Kay']
  spec.email = ['aaron@madlon-kay.com']

  # Description and requirements
  spec.summary = 'An SRX segmenting engine'
  spec.homepage = 'https://github.com/amake/srx-ruby'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.4.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'

  # Package every file tracked by git (listed from the gemspec's own
  # directory), leaving out anything under test/, spec/ or features/.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match?(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'nokogiri', '~>1.11'

  # Development-only dependencies, all unversioned
  %w[
    byebug
    memory_profiler
    minitest
    rake
    rspec-expectations
    rubocop
    solargraph
  ].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
metadata ADDED
@@ -0,0 +1,185 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Madlon-Kay
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-02-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: byebug
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: memory_profiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec-expectations
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: solargraph
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ - aaron@madlon-kay.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".dir-locals.el"
133
+ - ".github/workflows/main.yml"
134
+ - ".gitignore"
135
+ - ".rubocop.yml"
136
+ - ".rubocop_todo.yml"
137
+ - ".solargraph.yml"
138
+ - Gemfile
139
+ - Gemfile.lock
140
+ - LICENSE.txt
141
+ - README.md
142
+ - Rakefile
143
+ - bin/benchmark
144
+ - bin/console
145
+ - bin/profile
146
+ - bin/segment
147
+ - bin/setup
148
+ - lib/srx.rb
149
+ - lib/srx/data.rb
150
+ - lib/srx/engine.rb
151
+ - lib/srx/format.rb
152
+ - lib/srx/format/base_format.rb
153
+ - lib/srx/format/text.rb
154
+ - lib/srx/format/xml.rb
155
+ - lib/srx/icu_regex.rb
156
+ - lib/srx/srx-20-sample.srx
157
+ - lib/srx/util.rb
158
+ - lib/srx/version.rb
159
+ - srx.gemspec
160
+ homepage: https://github.com/amake/srx-ruby
161
+ licenses:
162
+ - MIT
163
+ metadata:
164
+ homepage_uri: https://github.com/amake/srx-ruby
165
+ source_code_uri: https://github.com/amake/srx-ruby.git
166
+ post_install_message:
167
+ rdoc_options: []
168
+ require_paths:
169
+ - lib
170
+ required_ruby_version: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: 2.4.0
175
+ required_rubygems_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ requirements: []
181
+ rubygems_version: 3.1.4
182
+ signing_key:
183
+ specification_version: 4
184
+ summary: An SRX segmenting engine
185
+ test_files: []