srx 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.dir-locals.el +4 -0
- data/.github/workflows/main.yml +21 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +13 -0
- data/.rubocop_todo.yml +33 -0
- data/.solargraph.yml +17 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +120 -0
- data/Rakefile +16 -0
- data/bin/benchmark +94 -0
- data/bin/console +15 -0
- data/bin/profile +33 -0
- data/bin/segment +28 -0
- data/bin/setup +8 -0
- data/lib/srx.rb +13 -0
- data/lib/srx/data.rb +169 -0
- data/lib/srx/engine.rb +136 -0
- data/lib/srx/format.rb +26 -0
- data/lib/srx/format/base_format.rb +38 -0
- data/lib/srx/format/text.rb +12 -0
- data/lib/srx/format/xml.rb +53 -0
- data/lib/srx/icu_regex.rb +22 -0
- data/lib/srx/srx-20-sample.srx +86 -0
- data/lib/srx/util.rb +16 -0
- data/lib/srx/version.rb +5 -0
- data/srx.gemspec +37 -0
- metadata +185 -0
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Interactive console with the srx gem already loaded.
#
# Fixtures or other initialization code can be added below to make
# experimenting with the gem easier. A different console can be used as well;
# for Pry, add pry to the Gemfile and swap the IRB lines for:
#
#   require 'pry'
#   Pry.start

require 'bundler/setup'
require 'srx'
require 'irb'

IRB.start(__FILE__)
|
data/bin/profile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Reports memory usage of repeatedly segmenting this project's LICENSE.txt.
#
# Options:
#   -s, --srx FILE       SRX rules to use instead of the bundled defaults
#   -f, --format FORMAT  input format handed to Srx::Engine (default: text)

require 'bundler/setup'
require 'srx'
require 'optparse'
require 'memory_profiler'

options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
  opts.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
end
parser.parse!(into: options)

srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default
format = (options[:format] || :text).to_sym
engine = Srx::Engine.new(data, format: format)

# The license text is passed through Srx::Util.unwrap before segmenting.
license_path = File.expand_path('../LICENSE.txt', __dir__)
license_text = Srx::Util.unwrap(File.read(license_path).strip)

iterations = 100

report = MemoryProfiler.report do
  iterations.times { engine.segment(license_text, language: 'en') }
end

report.pretty_print
|
data/bin/segment
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Segments the text read from the given files (or standard input when no file
# is given) and prints the resulting segments.
#
# Options:
#   -s, --srx FILE           SRX rules to use instead of the bundled defaults
#   -l, --language LANGUAGE  language of the input text (default: en)

require 'optparse'
require 'srx'

options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options] [file ...]"

  opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
  opts.on('-lLANGUAGE', '--language LANGUAGE', 'Language of input text (default: en)')
end
parser.parse!(into: options)

srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default
language = options[:language] || 'en'
engine = Srx::Engine.new(data)

begin
  puts engine.segment(ARGF.read, language: language)
rescue Interrupt
  # Ctrl-C while reading input: exit with a failure status, without a backtrace.
  exit(1)
end
|
data/bin/setup
ADDED
data/lib/srx.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'srx/version'
|
4
|
+
require_relative 'srx/data'
|
5
|
+
require_relative 'srx/engine'
|
6
|
+
require_relative 'srx/icu_regex'
|
7
|
+
require_relative 'srx/util'
|
8
|
+
require_relative 'srx/format'
|
9
|
+
|
10
|
+
# Top-level namespace of the srx gem.
module Srx
  # Error class defined by this library; a direct subclass of StandardError.
  Error = Class.new(StandardError)
end
|
data/lib/srx/data.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module Srx
  # Abstract convenience wrapper around Nokogiri XML node
  class XmlWrapper
    # Prefix-to-URI mapping handed to every XPath query.
    NS = { 'srx' => 'http://www.lisa.org/srx20' }.freeze

    # @param xml [Object] the wrapped node; it must respond to
    #   +xpath(path, namespaces)+, and subclasses read attributes from it
    #   with +[]+
    def initialize(xml)
      @xml = xml
    end

    # Run an XPath query relative to the wrapped node.
    #
    # Every segment is prefixed with the +srx:+ namespace and the segments are
    # joined into a child path, so +xpath(:srx, :header)+ queries
    # +./srx:srx/srx:header+.
    #
    # @param segments [Array<Symbol,String>] element names (or name tests with
    #   predicates) from outermost to innermost
    # @return whatever +@xml.xpath+ returns for the constructed path
    def xpath(*segments)
      path = ['.', *segments].join('/srx:')
      @xml.xpath(path, NS)
    end
  end

  private_constant :XmlWrapper

  # SRX data
  class Data < XmlWrapper
    class << self
      # Default SRX rules, loaded from the srx-20-sample.srx file that sits
      # next to this source file.
      #
      # @return [Data]
      def default
        from_file(path: File.expand_path('srx-20-sample.srx', __dir__))
      end

      # @param path [String] path of the SRX file to read
      # @return [Data]
      def from_file(path:)
        File.open(path) { |io| from_io(io) }
      end

      # @param io [IO] stream containing SRX XML
      # @return [Data]
      def from_io(io)
        new(Nokogiri::XML.parse(io))
      end
    end

    # @return [Boolean] whether the header's +segmentsubflows+ attribute is "yes"
    def segment_subflows?
      header['segmentsubflows'] == 'yes'
    end

    # @return [Boolean] whether the header's +cascade+ attribute is "yes"
    def cascade?
      header['cascade'] == 'yes'
    end

    # @return [Boolean] the +include+ setting for formatting of type "start"
    def include_start_formatting?
      include_formatting?(:start)
    end

    # @return [Boolean] the +include+ setting for formatting of type "end"
    def include_end_formatting?
      include_formatting?(:end)
    end

    # @return [Boolean] the +include+ setting for formatting of type "isolated"
    def include_isolated_formatting?
      include_formatting?(:isolated)
    end

    # @return [Array<LanguageRule>]
    def language_rules
      @language_rules ||=
        xpath(:srx, :body, :languagerules, :languagerule)
        .map { |langrule| LanguageRule.new(langrule) }
    end

    # @return [Array<LanguageMap>]
    def map_rules
      @map_rules ||=
        xpath(:srx, :body, :maprules, :languagemap)
        .map { |maprule| LanguageMap.new(maprule) }
    end

    private

    # @return the first +<header>+ element, or nil when the document has none
    def header
      @header ||= xpath(:srx, :header).first
    end

    # @param type [Symbol]
    # @return the first +<formathandle>+ header element whose +type+
    #   attribute equals +type+, or nil when there is none
    def format_handle(type)
      xpath(:srx, :header, "formathandle[@type='#{type}']").first
    end

    # @param type [Symbol] one of +:start+, +:end+ or +:isolated+
    # @return [Boolean]
    # @raise [ArgumentError] if there is no matching +<formathandle>+ element
    #   and +type+ is not one of the known types
    def include_formatting?(type)
      elem = format_handle(type)

      return elem['include'] == 'yes' if elem

      # Defaults are
      #   <formathandle type="start" include="no"/>
      #   <formathandle type="end" include="yes"/>
      #   <formathandle type="isolated" include="no"/>
      #
      # The types are listed individually: `when %i[start isolated]` would
      # compare the symbol against the whole array and never match.
      case type
      when :start, :isolated then false
      when :end then true
      else raise(ArgumentError, "Unknown formatting type: #{type}")
      end
    end

    # SRX <languagerule> element
    class LanguageRule < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def name
        @xml['languagerulename']
      end

      # @return [Array<Rule>] the +<rule>+ children, in document order
      def rules
        @rules ||= xpath(:rule).map { |rule| Rule.new(rule) }
      end

      # SRX <rule> element
      class Rule < XmlWrapper
        # @return [Regexp,nil] compiled +<beforebreak>+ pattern, or nil when
        #   the element is absent
        attr_reader :before_break

        # @return [Regexp,nil] compiled +<afterbreak>+ pattern, or nil when
        #   the element is absent
        attr_reader :after_break

        # @param xml [Object] the +<rule>+ node
        def initialize(xml)
          super(xml)

          # Eagerly load everything for this class because before_break and
          # after_break can be legitimately nil, so lazy loading gets ugly.

          # A missing break attribute counts as a breaking rule.
          @break = @xml['break'].nil? || @xml['break'] == 'yes'
          @before_break = compile_child(:beforebreak)
          @after_break = compile_child(:afterbreak)
        end

        # @return [Boolean] false only when the +break+ attribute is present
        #   and not "yes"
        def break?
          @break
        end

        # @return [String]
        def inspect
          "Rule[break=#{break?},before=#{before_break},after=#{after_break}]"
        end

        private

        # Compile the text of the first child element called +element+.
        #
        # @param element [Symbol] child element name
        # @return [Regexp,nil] nil when there is no such child
        def compile_child(element)
          pattern = xpath(element).first&.text
          IcuRegex.compile(pattern) if pattern
        end
      end
    end

    # SRX <languagemap> element
    class LanguageMap < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def language_rule_name
        @xml['languagerulename']
      end

      # @return [Regexp,nil] compiled +languagepattern+ attribute, or nil when
      #   the attribute is absent
      def language_pattern
        @language_pattern ||= @xml['languagepattern'].then do |pattern|
          IcuRegex.compile(pattern) if pattern
        end
      end
    end
  end
end
|
data/lib/srx/engine.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
|
4
|
+
# Engine for performing SRX segmenting
class Engine
  # @return [Data]
  attr_reader :data

  # @param data [Data] the SRX rules to segment with
  # @param format [Symbol] name of the input format, resolved with
  #   +Format.get+
  def initialize(data, format: :text)
    @data = data
    @format = Format.get(format)
  end

  # Split +str+ into segments according to the rules that apply to
  # +language+. Markup is removed by the format before the rules are applied
  # and re-inserted into the segments afterwards.
  #
  # @param str [String]
  # @param language [String]
  # @return [Array<String>]
  def segment(str, language:)
    active_rules = rules(language)
    plain_text, markups = @format.extract_markups(str)

    segment_start = 0
    breaks_by_pos(plain_text, active_rules).map do |break_pos, _rule|
      piece = build_segment!(plain_text, markups, segment_start, break_pos)
      segment_start = break_pos
      piece
    end
  end

  # All rules that apply to +language+, in the order of the language maps
  # that selected them.
  #
  # @param language [String]
  # @return [Array<Data::LanguageRule::Rule>]
  # @raise [KeyError] if a language map names a language rule that the data
  #   does not define
  def rules(language)
    names = rule_names(language)
    rule_map = @data.language_rules.to_h { |lang_rule| [lang_rule.name, lang_rule] }

    names.flat_map do |name|
      lang_rule = rule_map.fetch(name) do
        raise(KeyError, "Language map references unknown language rule: #{name}")
      end
      lang_rule.rules
    end
  end

  # Names of the language rules whose language map pattern matches
  # +language+. When the data does not cascade, only the first matching map
  # is used.
  #
  # @param language [String]
  # @return [Array<String>]
  def rule_names(language)
    names = []

    @data.map_rules.each do |lang_map|
      next unless lang_map.language_pattern.match?(language)

      names << lang_map.language_rule_name
      break unless @data.cascade?
    end

    names.compact
  end

  # @param str [String]
  # @param rules [Array<Data::LanguageRule::Rule>]
  # @return [Array<Array(Integer,Data::LanguageRule::Rule)>] pairs of 1) the
  #   position of a break, and 2) the rule that matched at that position,
  #   sorted by position. Note that the final break will always be at the end
  #   of the string and may not have an associated rule.
  def breaks_by_pos(str, rules)
    breaks =
      rules
      .flat_map { |rule| all_matches(str, rule) }
      .group_by(&:first)
      # Rules are tried in order, so the first rule to match at a position
      # decides whether that position is a break.
      .transform_values { |pairs| pairs.first.last }
      .select { |_pos, rule| rule.break? }
      .sort_by(&:first)

    breaks << [str.length] unless breaks.last&.first == str.length
    breaks
  end

  # Find every position in +str+ at which +rule+ matches. A rule matches at a
  # position when its before_break pattern (if any) matches text ending
  # there and its after_break pattern (if any) matches text starting there.
  #
  # Position 0 is never reported: nothing precedes it, so a break there would
  # only produce an empty leading segment.
  #
  # @param str [String]
  # @param rule [Data::LanguageRule::Rule]
  # @return [Array<Array(Integer,Data::LanguageRule::Rule)>]
  # @raise [RuntimeError] if +str+ is not empty and the rule has neither
  #   pattern
  def all_matches(str, rule)
    results = []

    pos = 0
    while pos < str.length
      if rule.before_break
        m = rule.before_break.match(str, pos)
        break unless m

        break_pos = m.end(0)
        matched = rule.after_break.nil? || m.post_match.start_with?(rule.after_break)
        # Resume after this match; an empty match would never advance, so
        # step over one character in that case. Only the search position is
        # bumped, not the reported break position.
        pos = m.begin(0) == break_pos ? break_pos + 1 : break_pos
      elsif rule.after_break
        m = rule.after_break.match(str, pos)
        break unless m

        # The break lies in front of the text matched by after_break.
        break_pos = m.begin(0)
        matched = true
        pos = break_pos + 1
      else
        raise('Rule has neither before_break nor after_break')
      end

      results << [break_pos, rule] if matched && break_pos.positive?
    end

    results
  end

  # Cut +str[start...finish]+ and re-insert the markup that belongs to it.
  # Inserted markups are removed from the front of +markups+, which is
  # therefore modified in place.
  #
  # @param str [String]
  # @param markups [Array<Array(Integer,String)>]
  # @param start [Integer] start offset of segment in str
  # @param finish [Integer] end offset of segment in str
  # @return [String]
  def build_segment!(str, markups, start, finish)
    segment = str[start...finish]

    # NOTE(review): segment.length grows as markup is inserted, so markup
    # positions presumably account for preceding markup -- confirm against
    # the format's extract_markups.
    until markups.empty?
      markup_pos, markup = markups.first
      # The next markup lies beyond the segment built so far.
      break unless start + segment.length >= markup_pos

      # Markup sitting exactly on the trailing edge is only included when the
      # <formathandle> rules say so.
      break if start + segment.length == markup_pos && !include_edge_formatting?(markup)

      segment.insert(markup_pos - start, markup)
      markups.shift
    end

    segment
  end

  # @param markup [String]
  # @return [Boolean] whether to include the specified edge markup in the
  #   current segment, in accordance with <formathandle> rules
  def include_edge_formatting?(markup)
    return false if !@data.include_start_formatting? && @format.start_formatting?(markup)
    return false if !@data.include_end_formatting? && @format.end_formatting?(markup)
    return false if !@data.include_isolated_formatting? && @format.isolated_formatting?(markup)

    true
  end
end
|
136
|
+
end
|
data/lib/srx/format.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'format/base_format'
|
4
|
+
require_relative 'format/text'
|
5
|
+
require_relative 'format/xml'
|
6
|
+
|
7
|
+
module Srx
  # Format-specific data and logic
  module Format
    # Format handlers keyed by format name.
    FORMATS = {
      text: Text.new,
      xml: Xml.new,
      html: Xml.new # TODO: specialize for HTML
    }.freeze

    # Look up the handler registered for +format+.
    #
    # @param format [Symbol]
    # @return [BaseFormat]
    # @raise [ArgumentError] if +format+ is not a key of FORMATS
    def self.get(format)
      FORMATS.fetch(format) { raise(ArgumentError, "Unknown format: #{format}") }
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Srx
  module Format
    # Interface definition for format support. Every method here raises
    # NotImplementedError.
    class BaseFormat
      # Separate markup from text.
      #
      # @abstract
      # @param str [String]
      # @return [Array(String,Array<Array(Integer,String)>)] two items: 1) input
      #   +str+ with markups removed, and 2) a list of markups, i.e. +[pos,
      #   string]+ pairs
      def extract_markups(str)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def start_formatting?(markup)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def end_formatting?(markup)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def isolated_formatting?(markup)
        raise NotImplementedError
      end
    end
  end
end
|