srx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dir-locals.el +4 -0
- data/.github/workflows/main.yml +21 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +13 -0
- data/.rubocop_todo.yml +33 -0
- data/.solargraph.yml +17 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +120 -0
- data/Rakefile +16 -0
- data/bin/benchmark +94 -0
- data/bin/console +15 -0
- data/bin/profile +33 -0
- data/bin/segment +28 -0
- data/bin/setup +8 -0
- data/lib/srx.rb +13 -0
- data/lib/srx/data.rb +169 -0
- data/lib/srx/engine.rb +136 -0
- data/lib/srx/format.rb +26 -0
- data/lib/srx/format/base_format.rb +38 -0
- data/lib/srx/format/text.rb +12 -0
- data/lib/srx/format/xml.rb +53 -0
- data/lib/srx/icu_regex.rb +22 -0
- data/lib/srx/srx-20-sample.srx +86 -0
- data/lib/srx/util.rb +16 -0
- data/lib/srx/version.rb +5 -0
- data/srx.gemspec +37 -0
- metadata +185 -0
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'srx'
|
6
|
+
|
7
|
+
# Interactive console with the gem already loaded. Put fixtures or other
# setup code here to make experimenting with the gem easier.
#
# To use Pry instead of IRB, add pry to the Gemfile and replace the two
# lines below with:
#
#   require 'pry'
#   Pry.start

require 'irb'
IRB.start(__FILE__)
|
data/bin/profile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'srx'
|
6
|
+
require 'optparse'
|
7
|
+
require 'memory_profiler'
|
8
|
+
|
9
|
+
# Command-line switches are collected into a hash keyed by long option name.
options = {}
parser = OptionParser.new
parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
parser.on('-sFILE', '--srx FILE', 'SRX file (optional)')
parser.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
parser.parse!(into: options)

# Use the SRX file given on the command line, or the bundled default rules.
srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default

format = options[:format] ? options[:format].to_sym : :text
engine = Srx::Engine.new(data, format: format)

# The project's license text serves as the sample input.
license_path = File.expand_path('../LICENSE.txt', __dir__)
license_text = Srx::Util.unwrap(File.read(license_path).strip)

n = 100

# Profile allocations made while segmenting the sample text n times.
report = MemoryProfiler.report do
  n.times do
    engine.segment(license_text, language: 'en')
  end
end

report.pretty_print
|
data/bin/segment
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'srx'
|
6
|
+
|
7
|
+
# Command-line switches are collected into a hash keyed by long option name.
options = {}
parser = OptionParser.new
parser.banner = "Usage: #{$PROGRAM_NAME} [options] [file ...]"
parser.on('-sFILE', '--srx FILE', 'SRX file (optional)')
parser.on('-lLANGUAGE', '--language LANGUAGE', 'Language of input text (default: en)')
parser.parse!(into: options)

language = options[:language] || 'en'

# Use the SRX file given on the command line, or the bundled default rules.
srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default
engine = Srx::Engine.new(data)

# Read all input (named files or stdin via ARGF) and print one segment per
# line. An interrupt while reading or segmenting exits with status 1.
begin
  puts engine.segment(ARGF.read, language: language)
rescue Interrupt
  exit(1)
end
|
data/bin/setup
ADDED
data/lib/srx.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'srx/version'
|
4
|
+
require_relative 'srx/data'
|
5
|
+
require_relative 'srx/engine'
|
6
|
+
require_relative 'srx/icu_regex'
|
7
|
+
require_relative 'srx/util'
|
8
|
+
require_relative 'srx/format'
|
9
|
+
|
10
|
+
# Top-level namespace of the gem.
module Srx
  # Generic error class defined in the gem's namespace.
  class Error < StandardError
  end
end
|
data/lib/srx/data.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module Srx
  # Abstract convenience wrapper around a Nokogiri XML node. Subclasses read
  # attributes via +@xml[...]+ and child elements via {#xpath}.
  class XmlWrapper
    # Namespace prefix mapping handed to every XPath query.
    NS = { 'srx' => 'http://www.lisa.org/srx20' }.freeze

    # @param xml the wrapped XML node; it must respond to +xpath(path, ns)+
    #   and +[]+
    def initialize(xml)
      @xml = xml
    end

    # Query descendants of the wrapped node. Each segment is prefixed with
    # the +srx:+ namespace and the segments are joined into a relative path,
    # e.g. +xpath(:srx, :header)+ queries +./srx:srx/srx:header+.
    #
    # @param segments [Array<Symbol,String>] element names / XPath steps
    # @return the result of the node's +xpath+ call
    def xpath(*segments)
      @xml.xpath(['.', *segments].join('/srx:'), NS)
    end
  end

  private_constant :XmlWrapper

  # SRX data
  class Data < XmlWrapper
    class << self
      # Default SRX rules, loaded from the +srx-20-sample.srx+ file that sits
      # next to this source file.
      #
      # @return [Data]
      def default
        from_file(path: File.expand_path('srx-20-sample.srx', __dir__))
      end

      # @param path [String] path of an SRX file
      # @return [Data]
      def from_file(path:)
        File.open(path) { |io| from_io(io) }
      end

      # @param io [IO] stream containing an SRX document
      # @return [Data]
      def from_io(io)
        new(Nokogiri::XML.parse(io))
      end
    end

    # @return [Boolean] whether the header's +segmentsubflows+ attribute is
    #   +yes+
    def segment_subflows?
      header['segmentsubflows'] == 'yes'
    end

    # @return [Boolean] whether the header's +cascade+ attribute is +yes+
    def cascade?
      header['cascade'] == 'yes'
    end

    # @return [Boolean]
    def include_start_formatting?
      include_formatting?(:start)
    end

    # @return [Boolean]
    def include_end_formatting?
      include_formatting?(:end)
    end

    # @return [Boolean]
    def include_isolated_formatting?
      include_formatting?(:isolated)
    end

    # @return [Array<LanguageRule>]
    def language_rules
      @language_rules ||=
        xpath(:srx, :body, :languagerules, :languagerule)
        .map { |langrule| LanguageRule.new(langrule) }
    end

    # @return [Array<LanguageMap>]
    def map_rules
      @map_rules ||=
        xpath(:srx, :body, :maprules, :languagemap)
        .map { |maprule| LanguageMap.new(maprule) }
    end

    private

    # @return the first +<header>+ element, or nil when there is none
    def header
      @header ||= xpath(:srx, :header).first
    end

    # @param type [Symbol] +:start+, +:end+ or +:isolated+
    # @return the matching +<formathandle>+ element, or nil when absent
    def format_handle(type)
      xpath(:srx, :header, "formathandle[@type='#{type}']").first
    end

    # Whether formatting of the given type is included in the segment, taken
    # from the +<formathandle>+ element when present and from the defaults
    # otherwise.
    #
    # @param type [Symbol] +:start+, +:end+ or +:isolated+
    # @return [Boolean]
    # @raise [ArgumentError] if no +<formathandle>+ exists for +type+ and
    #   +type+ is not one of the three known types
    def include_formatting?(type)
      elem = format_handle(type)

      return elem['include'] == 'yes' if elem

      # Defaults are
      #   <formathandle type="start" include="no"/>
      #   <formathandle type="end" include="yes"/>
      #   <formathandle type="isolated" include="no"/>
      #
      # The symbols must be listed individually: `when %i[start isolated]`
      # would compare +type+ against the whole array and never match.
      case type
      when :start, :isolated then false
      when :end then true
      else raise(ArgumentError, "Unknown formatting type: #{type}")
      end
    end

    # SRX <languagerule> element
    class LanguageRule < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def name
        @xml['languagerulename']
      end

      # @return [Array<Rule>]
      def rules
        @rules ||= xpath(:rule).map { |rule| Rule.new(rule) }
      end

      # SRX <rule> element
      class Rule < XmlWrapper
        # @return [Regexp,nil] compiled +<beforebreak>+ pattern, or nil when
        #   the rule has no such child
        attr_reader :before_break

        # @return [Regexp,nil] compiled +<afterbreak>+ pattern, or nil when
        #   the rule has no such child
        attr_reader :after_break

        def initialize(xml)
          super(xml)

          # Eagerly load everything for this class because before_break and
          # after_break can be legitimately nil, so lazy loading gets ugly.

          # A missing +break+ attribute counts as a breaking rule.
          @break = @xml['break'].nil? || @xml['break'] == 'yes'
          @before_break = compile_child(:beforebreak)
          @after_break = compile_child(:afterbreak)
        end

        # @return [Boolean] whether this rule produces a break (as opposed to
        #   suppressing one)
        def break?
          @break
        end

        def inspect
          "Rule[break=#{break?},before=#{before_break},after=#{after_break}]"
        end

        private

        # Compile the text of the first child element with the given name.
        #
        # @param element [Symbol] child element name
        # @return the result of IcuRegex.compile, or nil when the child is
        #   absent
        def compile_child(element)
          pattern = xpath(element).first&.text
          IcuRegex.compile(pattern) if pattern
        end
      end
    end

    # SRX <languagemap> element
    class LanguageMap < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def language_rule_name
        @xml['languagerulename']
      end

      # @return [Regexp,nil] compiled +languagepattern+ attribute, or nil
      #   when the attribute is absent
      def language_pattern
        @language_pattern ||= @xml['languagepattern'].then do |pattern|
          IcuRegex.compile(pattern) if pattern
        end
      end
    end
  end
end
|
data/lib/srx/engine.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  # Engine for performing SRX segmenting
  class Engine
    # @return [Data]
    attr_reader :data

    # @param data [Data] the SRX rules to segment with
    # @param format [Symbol] key passed to Format.get to select the handler
    #   used for extracting and classifying markup (default +:text+)
    def initialize(data, format: :text)
      @data = data
      @format = Format.get(format)
    end

    # Split +str+ into segments according to the rules that apply to
    # +language+. Markup is extracted before the rules are matched and is
    # re-inserted into the resulting segments.
    #
    # @param str [String]
    # @param language [String]
    # @return [Array<String>]
    def segment(str, language:)
      active_rules = rules(language)
      plain_text, markups = @format.extract_markups(str)

      start = 0
      breaks_by_pos(plain_text, active_rules).map do |break_pos, _rule|
        build_segment!(plain_text, markups, start, break_pos).tap { start = break_pos }
      end
    end

    # All rules that apply to +language+, in the order of the matching
    # language maps.
    #
    # @param language [String]
    # @return [Array<Data::LanguageRule::Rule>]
    # @raise [KeyError] if a matching language map names a language rule
    #   that the data does not define
    def rules(language)
      rules_by_name = @data.language_rules.map { |lang_rule| [lang_rule.name, lang_rule] }.to_h

      # fetch, not []: a dangling rule name is an error in the SRX data and
      # should say which name is missing rather than fail on nil.
      rule_names(language).flat_map { |name| rules_by_name.fetch(name).rules }
    end

    # Names of the language rules whose language map pattern matches
    # +language+. All matching maps are used when the data cascades;
    # otherwise only the first matching map is used. Maps without a pattern
    # never match.
    #
    # @param language [String]
    # @return [Array<String>]
    def rule_names(language)
      matches = ->(lang_map) { lang_map.language_pattern&.match?(language) }

      selected =
        if @data.cascade?
          @data.map_rules.select(&matches)
        else
          [@data.map_rules.find(&matches)].compact
        end

      selected.map(&:language_rule_name).compact
    end

    # @param str [String]
    # @param rules [Array<Data::LanguageRule::Rule>]
    # @return [Array<Array(Integer,Data::LanguageRule::Rule)>] pairs of 1) the
    #   position of a break, and 2) the rule that matched at that position,
    #   sorted by position. Note that the final break will always be at the
    #   end of the string and may not have an associated rule.
    def breaks_by_pos(str, rules)
      # When several rules match at one position, the earliest rule wins.
      # This is how a non-breaking rule suppresses a later breaking rule.
      winner_at = {}
      rules.each do |rule|
        all_matches(str, rule).each { |pos, matched| winner_at[pos] ||= matched }
      end

      breaks = winner_at.select { |_pos, rule| rule.break? }.sort_by(&:first)
      breaks << [str.length] unless breaks.last&.first == str.length
      breaks
    end

    # Every position in +str+ at which +rule+ matches. A position is the
    # offset just after the before-break match and just before the
    # after-break match.
    #
    # @param str [String]
    # @param rule [Data::LanguageRule::Rule]
    # @return [Array<Array(Integer,Data::LanguageRule::Rule)>]
    # @raise [RuntimeError] if +str+ is non-empty and the rule has neither
    #   pattern
    def all_matches(str, rule)
      results = []

      pos = 0
      while pos < str.length
        if rule.before_break
          m = rule.before_break.match(str, pos)
          break unless m

          break_pos = m.end(0)
          matched = rule.after_break.nil? || m.post_match.start_with?(rule.after_break)
          # Step over a zero-width match so the search always advances. The
          # break itself stays where the after-break pattern was checked.
          pos = break_pos == m.begin(0) ? break_pos + 1 : break_pos
        elsif rule.after_break
          m = rule.after_break.match(str, pos)
          break unless m

          # The break sits immediately before the after-break text.
          break_pos = m.begin(0)
          matched = true
          pos = break_pos + 1
        else
          raise('Rule has neither before_break nor after_break')
        end

        # Position 0 has no text before it, so it cannot split anything.
        results << [break_pos, rule] if matched && break_pos.positive?
      end

      results
    end

    # Cut one segment out of +str+ and re-insert the markups that belong to
    # it. Inserted markups are removed from the front of +markups+, which is
    # why the method mutates its argument.
    #
    # @param str [String]
    # @param markups [Array<Array(Integer,String)>]
    # @param start [Integer] start offset of segment in str
    # @param finish [Integer] end offset of segment in str
    # @return [String]
    def build_segment!(str, markups, start, finish)
      segment = str[start...finish]

      until markups.empty?
        markup_pos, markup = markups.first
        segment_end = start + segment.length

        # Stop at the first markup positioned after the segment's current end.
        break if segment_end < markup_pos

        # A markup exactly at the edge is kept only when the <formathandle>
        # rules include it in this segment.
        break if segment_end == markup_pos && !include_edge_formatting?(markup)

        segment.insert(markup_pos - start, markup)
        markups.shift
      end

      segment
    end

    # @param markup [String]
    # @return [Boolean] whether to include the specified edge markup in the
    #   current segment, in accordance with <formathandle> rules
    def include_edge_formatting?(markup)
      return false if @format.start_formatting?(markup) && !@data.include_start_formatting?
      return false if @format.end_formatting?(markup) && !@data.include_end_formatting?
      return false if @format.isolated_formatting?(markup) && !@data.include_isolated_formatting?

      true
    end
  end
end
|
data/lib/srx/format.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'format/base_format'
|
4
|
+
require_relative 'format/text'
|
5
|
+
require_relative 'format/xml'
|
6
|
+
|
7
|
+
module Srx
  # Format-specific data and logic
  module Format
    # Handler instance for each supported format name.
    FORMATS = {
      text: Text.new,
      xml: Xml.new,
      html: Xml.new # TODO: specialize for HTML
    }.freeze

    # Look up the handler registered under +format+.
    #
    # @param format [Symbol] one of the keys of FORMATS
    # @return [BaseFormat]
    # @raise [ArgumentError] if +format+ is not a key of FORMATS
    def self.get(format)
      FORMATS.fetch(format) { raise(ArgumentError, "Unknown format: #{format}") }
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  module Format
    # Interface that every format handler implements. Each method here
    # raises NotImplementedError until a subclass overrides it.
    class BaseFormat
      # Separate the markup from the text of +str+.
      #
      # @abstract
      # @param str [String]
      # @return [Array(String,Array<Array(Integer,String)>)] two items: 1) input
      #   +str+ with markups removed, and 2) a list of markups, i.e. +[pos,
      #   string]+ pairs
      def extract_markups(str)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def start_formatting?(markup)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def end_formatting?(markup)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def isolated_formatting?(markup)
        raise NotImplementedError
      end
    end
  end
end
|