srx 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  module Format
    # Support for plain text
    #
    # Plain text carries no markup, so nothing is ever extracted from it.
    class Text < BaseFormat
      # @param str [String] the text to process
      # @return [Array] a two-element array: +str+ itself, unchanged, followed
      #   by an empty list of markups
      def extract_markups(str)
        no_markups = []
        [str, no_markups]
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'English'
4
+
5
module Srx
  module Format
    # Support for XML
    #
    # Tags are recognized with regular expressions assembled from smaller
    # pieces named after the grammar productions of the XML specification
    # (NameStartChar, NameChar, Name, S, Eq, EntityRef, CharRef, Reference,
    # AttValue, Attribute, STag, ETag, EmptyElemTag).
    #
    # @see https://www.w3.org/TR/xml/
    class Xml < BaseFormat
      # rubocop:disable Layout/LineLength
      NAME_START_CHAR = /[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u{10000}-\u{EFFFF}]/.freeze
      # rubocop:enable Layout/LineLength
      NAME_CHAR = /#{NAME_START_CHAR}|[-.0-9\u00B7\u0300-\u036F\u203F-\u2040]/.freeze
      NAME = /#{NAME_START_CHAR}#{NAME_CHAR}*/.freeze
      SPACE = /[\u0020\u0009\u000D\u000A]+/.freeze
      EQUALS = /#{SPACE}?=#{SPACE}?/.freeze
      ENTITY_REF = /&#{NAME};/.freeze
      CHAR_REF = /&#[0-9]+;|&#x[0-9a-fA-F]+;/.freeze
      REFERENCE = /#{ENTITY_REF}|#{CHAR_REF}/.freeze
      ATT_VALUE = /"(?:[^<&"]|#{REFERENCE})*"|'(?:[^<&']|#{REFERENCE})*'/.freeze
      ATTRIBUTE = /#{NAME}#{EQUALS}#{ATT_VALUE}/.freeze
      START_TAG = /<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?>/.freeze
      END_TAG = %r{</#{NAME}#{SPACE}?>}.freeze
      EMPTY_ELEM_TAG = %r{<#{NAME}(?:#{SPACE}#{ATTRIBUTE})*#{SPACE}?/>}.freeze

      # Any start tag, end tag or empty-element tag.
      TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze

      # Strip every tag out of +str+, remembering where each one was.
      #
      # @param str [String]
      # @return [Array] a two-element array: +str+ with all {TAG} matches
      #   removed, and a list of +[offset, tag]+ pairs in order of occurrence,
      #   where +offset+ is the position in the original +str+ at which the
      #   tag began
      def extract_markups(str)
        found = []

        stripped = str.gsub(TAG) do |tag|
          # Offsets are relative to the original string, not the stripped one.
          offset = Regexp.last_match.begin(0)
          found.push([offset, tag])
          ''
        end

        [stripped, found]
      end

      # @param markup [String]
      # @return [Boolean] whether {START_TAG} matches somewhere in +markup+
      def start_formatting?(markup)
        START_TAG.match?(markup)
      end

      # @param markup [String]
      # @return [Boolean] whether {END_TAG} matches somewhere in +markup+
      def end_formatting?(markup)
        END_TAG.match?(markup)
      end

      # @param markup [String]
      # @return [Boolean] whether {EMPTY_ELEM_TAG} matches somewhere in +markup+
      def isolated_formatting?(markup)
        EMPTY_ELEM_TAG.match?(markup)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Utilities for handling SRX (ICU) regular expressions
  module IcuRegex
    # Matches an unescaped +\xhhhh+ or +\x{hhhh}+ escape (four hex digits).
    #
    # Named groups:
    # - +esc+: the run of escaped backslashes (pairs of +\+) directly in front
    #   of the escape, if any. It has to be part of the match so that an odd
    #   number of backslashes is told apart from an even one, and it is
    #   captured so the replacement can write it back unchanged.
    # - +hex+: the hex digits, including the braces when present.
    #
    # Only the hex digits are matched case-insensitively. The +x+ itself must
    # be lowercase: +\X+ is a different escape (grapheme cluster) and must be
    # left alone.
    HEX_PATTERN = /(?<!\\)(?<esc>(?:\\\\)*)\\x(?<hex>[a-fA-F0-9]{4}|\{[a-fA-F0-9]{4}\})/.freeze

    class << self
      # Rewrite +\xhhhh+ / +\x{hhhh}+ escapes as Ruby's +\uhhhh+ / +\u{hhhh}+.
      # Everything else, including escaped backslashes preceding an escape, is
      # passed through untouched.
      #
      # @param icu_regex [String]
      # @return [String]
      def to_ruby(icu_regex)
        # `\k<esc>` re-emits the escaped backslashes swallowed by the match;
        # `\u` is not a substitution escape, so it is inserted literally.
        icu_regex.gsub(HEX_PATTERN, '\k<esc>\u\k<hex>')
      end

      # Convert +icu_regex+ with {to_ruby} and compile the result.
      #
      # @param icu_regex [String]
      # @return [Regexp]
      def compile(icu_regex)
        Regexp.new(to_ruby(icu_regex))
      end
    end
  end
end
@@ -0,0 +1,86 @@
1
+ <?xml version="1.0"?>
2
+ <srx version="2.0"
3
+ xmlns="http://www.lisa.org/srx20"
4
+ xsi:schemaLocation="http://www.lisa.org/srx20 srx20.xsd"
5
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
6
+ <header segmentsubflows="yes" cascade="yes">
7
+ <formathandle type="start" include="no"/>
8
+ <formathandle type="end" include="yes"/>
9
+ <formathandle type="isolated" include="yes"/>
10
+ </header>
11
+ <body>
12
+ <languagerules>
13
+ <languagerule languagerulename="Default">
14
+ <!-- Common rules for most languages -->
15
+ <rule break="no">
16
+ <beforebreak>^\s*[0-9]+\.</beforebreak>
17
+ <afterbreak>\s</afterbreak>
18
+ </rule>
19
+ <rule break="yes">
20
+ <afterbreak>\n</afterbreak>
21
+ </rule>
22
+ <rule break="yes">
23
+ <beforebreak>[\.\?!]+</beforebreak>
24
+ <afterbreak>\s</afterbreak>
25
+ </rule>
26
+ </languagerule>
27
+ <languagerule languagerulename="English">
28
+ <!-- Some English abbreviations -->
29
+ <rule break="no">
30
+ <beforebreak>\s[Ee][Tt][Cc]\.</beforebreak>
31
+ <afterbreak>\s[a-z]</afterbreak>
32
+ </rule>
33
+ <rule break="no">
34
+ <beforebreak>\sMr\.</beforebreak>
35
+ <afterbreak>\s</afterbreak>
36
+ </rule>
37
+ <rule break="no">
38
+ <beforebreak>\sU\.K\.</beforebreak>
39
+ <afterbreak>\s</afterbreak>
40
+ </rule>
41
+ </languagerule>
42
+ <languagerule languagerulename="French">
43
+ <!-- Some French abbreviations -->
44
+ <rule break="no">
45
+ <beforebreak>\s[Mm]lle\.</beforebreak>
46
+ <afterbreak>\s</afterbreak>
47
+ </rule>
48
+ <rule break="no">
49
+ <beforebreak>\s[Mm]lles\.</beforebreak>
50
+ <afterbreak>\s</afterbreak>
51
+ </rule>
52
+ <rule break="no">
53
+ <beforebreak>\s[Mm]me\.</beforebreak>
54
+ <afterbreak>\s</afterbreak>
55
+ </rule>
56
+ <rule break="no">
57
+ <beforebreak>\s[Mm]mes\.</beforebreak>
58
+ <afterbreak>\s</afterbreak>
59
+ </rule>
60
+ </languagerule>
61
+ <languagerule languagerulename="Japanese">
62
+ <!-- Rules for breaking on Japanese punctuation
63
+
64
+ \xff61: Halfwidth ideographic full stop
65
+ \x3002: Ideographic full stop
66
+ \xff0e: Fullwidth full stop
67
+ \xff1f: Fullwidth question mark
68
+ \xff01: Fullwidth exclamation mark
69
+ -->
70
+ <rule break="yes">
71
+ <beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
72
+ <afterbreak></afterbreak>
73
+ </rule>
74
+ </languagerule>
75
+ </languagerules>
76
+ <maprules>
77
+ <!-- List exceptions first -->
78
+ <languagemap languagepattern="[Ee][Nn].*" languagerulename="English"/>
79
+ <languagemap languagepattern="[Ff][Rr].*" languagerulename="French"/>
80
+ <!-- Japanese breaking rules -->
81
+ <languagemap languagepattern="[Jj][Aa].*" languagerulename="Japanese"/>
82
+ <!-- Common breaking rules -->
83
+ <languagemap languagepattern=".*" languagerulename="Default"/>
84
+ </maprules>
85
+ </body>
86
+ </srx>
data/lib/srx/util.rb ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Miscellaneous utility functions
  module Util
    # Undo hard line wrapping.
    #
    # A newline that sits directly between two non-whitespace characters is
    # replaced with a single space. Newlines next to whitespace (for example
    # blank lines, or lines ending in a space) are kept as they are.
    #
    # @param str [String]
    # @return [String] a new string; +str+ itself is not modified
    def self.unwrap(str)
      str.gsub(/(?<=\S)\n(?=\S)/) { ' ' }
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Version string of the srx library.
  VERSION = '0.1.0'
end
data/srx.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/srx/version'
4
+
5
# Gem specification for srx.
Gem::Specification.new do |spec|
  spec.name = 'srx'
  spec.version = Srx::VERSION
  spec.authors = ['Aaron Madlon-Kay']
  spec.email = ['aaron@madlon-kay.com']

  spec.summary = 'An SRX segmenting engine'
  spec.homepage = 'https://github.com/amake/srx-ruby'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.4.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'

  # Package every file tracked by git (listed from the gemspec's own
  # directory), leaving out anything under test/, spec/ or features/.
  tracked_files = Dir.chdir(File.expand_path(__dir__)) { `git ls-files -z`.split("\x0") }
  spec.files = tracked_files.reject { |path| path.match?(%r{\A(?:test|spec|features)/}) }

  # Executables are whatever packaged files live under exe/.
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.add_dependency 'nokogiri', '~>1.11'

  # Development-only tooling, without version constraints.
  %w[byebug memory_profiler minitest rake rspec-expectations rubocop solargraph].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end
end
metadata ADDED
@@ -0,0 +1,185 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Madlon-Kay
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-02-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: byebug
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: memory_profiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec-expectations
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: solargraph
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ - aaron@madlon-kay.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".dir-locals.el"
133
+ - ".github/workflows/main.yml"
134
+ - ".gitignore"
135
+ - ".rubocop.yml"
136
+ - ".rubocop_todo.yml"
137
+ - ".solargraph.yml"
138
+ - Gemfile
139
+ - Gemfile.lock
140
+ - LICENSE.txt
141
+ - README.md
142
+ - Rakefile
143
+ - bin/benchmark
144
+ - bin/console
145
+ - bin/profile
146
+ - bin/segment
147
+ - bin/setup
148
+ - lib/srx.rb
149
+ - lib/srx/data.rb
150
+ - lib/srx/engine.rb
151
+ - lib/srx/format.rb
152
+ - lib/srx/format/base_format.rb
153
+ - lib/srx/format/text.rb
154
+ - lib/srx/format/xml.rb
155
+ - lib/srx/icu_regex.rb
156
+ - lib/srx/srx-20-sample.srx
157
+ - lib/srx/util.rb
158
+ - lib/srx/version.rb
159
+ - srx.gemspec
160
+ homepage: https://github.com/amake/srx-ruby
161
+ licenses:
162
+ - MIT
163
+ metadata:
164
+ homepage_uri: https://github.com/amake/srx-ruby
165
+ source_code_uri: https://github.com/amake/srx-ruby.git
166
+ post_install_message:
167
+ rdoc_options: []
168
+ require_paths:
169
+ - lib
170
+ required_ruby_version: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: 2.4.0
175
+ required_rubygems_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ requirements: []
181
+ rubygems_version: 3.1.4
182
+ signing_key:
183
+ specification_version: 4
184
+ summary: An SRX segmenting engine
185
+ test_files: []