srx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'srx'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
data/bin/profile ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'srx'
6
+ require 'optparse'
7
+ require 'memory_profiler'
8
+
9
# Command-line options, keyed by long option name (:srx, :format).
options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
  opts.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
end
parser.parse!(into: options)

# Use the user-supplied SRX file when given, otherwise the bundled defaults.
srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default
format = (options[:format] || :text).to_sym
engine = Srx::Engine.new(data, format: format)

# The project's license text, unwrapped, serves as the profiling workload.
license_path = File.expand_path('../LICENSE.txt', __dir__)
license_text = Srx::Util.unwrap(File.read(license_path).strip)

iterations = 100

# Profile memory use over repeated segmentation of the same text.
report = MemoryProfiler.report do
  iterations.times { engine.segment(license_text, language: 'en') }
end

report.pretty_print
data/bin/segment ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'optparse'
5
+ require 'srx'
6
+
7
# Command-line options, keyed by long option name (:srx, :language).
options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options] [file ...]"

  opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
  opts.on('-lLANGUAGE', '--language LANGUAGE', 'Language of input text (default: en)')
end
parser.parse!(into: options)

language = options[:language] || 'en'

# Use the user-supplied SRX file when given, otherwise the bundled defaults.
srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default
engine = Srx::Engine.new(data)

begin
  # ARGF reads the files named on the command line, or STDIN when none are given.
  input = ARGF.read
  puts engine.segment(input, language: language)
rescue Interrupt
  # Exit with a failure status on Ctrl-C instead of printing a backtrace.
  exit(1)
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/srx.rb ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'srx/version'
4
+ require_relative 'srx/data'
5
+ require_relative 'srx/engine'
6
+ require_relative 'srx/icu_regex'
7
+ require_relative 'srx/util'
8
+ require_relative 'srx/format'
9
+
10
# Top-level namespace of the srx gem.
module Srx
  # Library-specific error class, derived from StandardError.
  class Error < StandardError
  end
end
data/lib/srx/data.rb ADDED
@@ -0,0 +1,169 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+
5
module Srx
  # Abstract convenience wrapper around Nokogiri XML node
  class XmlWrapper
    # Prefix-to-URI mapping passed along with every XPath query.
    NS = { 'srx' => 'http://www.lisa.org/srx20' }.freeze

    # @param xml [Object] the XML node (or document) to wrap; it must respond
    #   to +xpath(path, namespaces)+ and, for element wrappers, to +[]+
    def initialize(xml)
      @xml = xml
    end

    # Queries the wrapped node with a relative XPath built from +segments+.
    # Each segment is prefixed with the +srx:+ namespace, so
    # +xpath(:srx, :header)+ evaluates +./srx:srx/srx:header+.
    #
    # @param segments [Array<Symbol,String>] path steps (may carry predicates)
    # @return [Object] whatever the wrapped node's +xpath+ returns
    def xpath(*segments)
      path = ['.', *segments].join('/srx:')
      @xml.xpath(path, NS)
    end
  end

  private_constant :XmlWrapper

  # SRX data
  class Data < XmlWrapper
    class << self
      # Default SRX rules, loaded from the +srx-20-sample.srx+ file that sits
      # next to this source file.
      #
      # @return [Data]
      def default
        from_file(path: File.expand_path('srx-20-sample.srx', __dir__))
      end

      # Loads SRX data from a file; the file is closed once parsing is done.
      #
      # @param path [String]
      # @return [Data]
      def from_file(path:)
        File.open(path) { |io| from_io(io) }
      end

      # Parses SRX data from an open IO.
      #
      # @param io [IO]
      # @return [Data]
      def from_io(io)
        new(Nokogiri::XML.parse(io))
      end
    end

    # @return [Boolean] whether the header's +segmentsubflows+ attribute is "yes"
    def segment_subflows?
      header['segmentsubflows'] == 'yes'
    end

    # @return [Boolean] whether the header's +cascade+ attribute is "yes"
    def cascade?
      header['cascade'] == 'yes'
    end

    # @return [Boolean] the <formathandle> setting for start formatting
    def include_start_formatting?
      include_formatting?(:start)
    end

    # @return [Boolean] the <formathandle> setting for end formatting
    def include_end_formatting?
      include_formatting?(:end)
    end

    # @return [Boolean] the <formathandle> setting for isolated formatting
    def include_isolated_formatting?
      include_formatting?(:isolated)
    end

    # @return [Array<LanguageRule>]
    def language_rules
      @language_rules ||=
        xpath(:srx, :body, :languagerules, :languagerule)
        .map { |langrule| LanguageRule.new(langrule) }
    end

    # @return [Array<LanguageMap>]
    def map_rules
      @map_rules ||=
        xpath(:srx, :body, :maprules, :languagemap)
        .map { |maprule| LanguageMap.new(maprule) }
    end

    private

    # @return [Object,nil] the first <header> element (memoized when found)
    def header
      @header ||= xpath(:srx, :header).first
    end

    # @param type [Symbol]
    # @return [Object,nil] the <formathandle> element of the given type, if any
    def format_handle(type)
      xpath(:srx, :header, "formathandle[@type='#{type}']").first
    end

    # Whether formatting of the given type belongs to the segment preceding a
    # break. An explicit <formathandle> element wins; otherwise the defaults
    # listed below apply.
    #
    # @param type [Symbol] one of +:start+, +:end+, +:isolated+
    # @return [Boolean]
    # @raise [ArgumentError] if +type+ is unknown and no element declares it
    def include_formatting?(type)
      elem = format_handle(type)

      return elem['include'] == 'yes' if elem

      # Defaults are
      #   <formathandle type="start" include="no"/>
      #   <formathandle type="end" include="yes"/>
      #   <formathandle type="isolated" include="no"/>
      #
      # The alternatives must be listed individually: `when` compares with
      # ===, and an Array never === a Symbol.
      case type
      when :start, :isolated then false
      when :end then true
      else raise(ArgumentError, "Unknown formatting type: #{type}")
      end
    end

    # SRX <languagerule> element
    class LanguageRule < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def name
        @xml['languagerulename']
      end

      # @return [Array<Rule>]
      def rules
        @rules ||= xpath(:rule).map { |rule| Rule.new(rule) }
      end

      # SRX <rule> element
      class Rule < XmlWrapper
        # @return [Regexp,nil]
        attr_reader :before_break

        # @return [Regexp,nil]
        attr_reader :after_break

        def initialize(xml)
          super(xml)

          # Eagerly load everything for this class because before_break and
          # after_break can be legitimately nil, so lazy loading gets ugly.

          # A missing +break+ attribute counts as a breaking rule.
          @break = @xml['break'].nil? || @xml['break'] == 'yes'
          @before_break = compile_pattern(:beforebreak)
          @after_break = compile_pattern(:afterbreak)
        end

        # @return [Boolean] whether this rule produces a break (as opposed to
        #   suppressing one)
        def break?
          @break
        end

        def inspect
          "Rule[break=#{break?},before=#{before_break},after=#{after_break}]"
        end

        private

        # Compiles the text of the first child element with the given name.
        #
        # @param element [Symbol] child element name
        # @return [Regexp,nil] nil when the element is absent
        def compile_pattern(element)
          pattern = xpath(element).first&.text
          IcuRegex.compile(pattern) if pattern
        end
      end
    end

    # SRX <languagemap> element
    class LanguageMap < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def language_rule_name
        @xml['languagerulename']
      end

      # @return [Regexp,nil] compiled +languagepattern+ attribute; nil when
      #   the attribute is absent
      def language_pattern
        @language_pattern ||= @xml['languagepattern'].then do |pattern|
          IcuRegex.compile(pattern) if pattern
        end
      end
    end
  end
end
data/lib/srx/engine.rb ADDED
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Engine for performing SRX segmenting
  class Engine
    # @return [Data]
    attr_reader :data

    # @param data [Data] SRX rules to segment with
    # @param format [Symbol] name of the input format, resolved through
    #   +Format.get+ (default: +:text+)
    def initialize(data, format: :text)
      @data = data
      @format = Format.get(format)
    end

    # Splits +str+ into segments according to the rules that apply to
    # +language+. Markup extracted by the format is re-inserted into the
    # segments it falls in.
    #
    # @param str [String]
    # @param language [String]
    # @return [Array<String>]
    def segment(str, language:)
      active_rules = rules(language)
      plain_text, markups = @format.extract_markups(str)

      segments = []
      seg_start = 0
      breaks_by_pos(plain_text, active_rules).each do |break_pos, _rule|
        segments << build_segment!(plain_text, markups, seg_start, break_pos)
        seg_start = break_pos
      end

      segments
    end

    # All rules that apply to +language+, in language-map order.
    #
    # @param language [String]
    # @return [Array<Data::LanguageRule::Rule>]
    # @raise [KeyError] if a language map names a language rule that the SRX
    #   data does not define
    def rules(language)
      rule_map = @data.language_rules.to_h { |lang_rule| [lang_rule.name, lang_rule] }

      rule_names(language).flat_map do |name|
        lang_rule = rule_map.fetch(name) do
          raise(KeyError, "Language map references unknown language rule: #{name.inspect}")
        end
        lang_rule.rules
      end
    end

    # Names of the language rules whose language map pattern matches
    # +language+. Without cascading only the first matching map is used.
    #
    # @param language [String]
    # @return [Array<String>]
    def rule_names(language)
      @data.map_rules.map do |lang_map|
        next unless lang_map.language_pattern.match?(language)

        # `break` makes the enclosing `map` return just this one name.
        break [lang_map.language_rule_name] unless @data.cascade?

        lang_map.language_rule_name
      end.compact
    end

    # Determines where +str+ is to be broken. When several rules match at
    # the same position, the one that comes first in +rules+ decides whether
    # that position is a break.
    #
    # @param str [String]
    # @param rules [Array<Data::LanguageRule::Rule>]
    # @return [Array<Array(Integer,Data::LanguageRule::Rule)>] pairs of 1) the
    #   position of a break, and 2) the rule that matched at that position,
    #   sorted by position. Note that the final break will always be at the
    #   end of the string and may not have an associated rule.
    def breaks_by_pos(str, rules)
      candidates = rules.flat_map { |rule| all_matches(str, rule) }
      deciding_rules = candidates.group_by(&:first).transform_values { |pairs| pairs.first.last }
      breaks = deciding_rules.select { |_pos, rule| rule.break? }.sort_by(&:first)

      breaks << [str.length] unless breaks.last&.first == str.length
      breaks
    end

    # Finds every position in +str+ at which +rule+ matches. The reported
    # position is the boundary between the before_break match and the
    # after_break match, i.e. the end of the former or, for rules without
    # before_break, the start of the latter. Position 0 is never reported
    # because nothing precedes it.
    #
    # @param str [String]
    # @param rule [Data::LanguageRule::Rule]
    # @return [Array<Array(Integer,Data::LanguageRule::Rule)>]
    def all_matches(str, rule)
      results = []

      pos = 0
      while pos < str.length
        if rule.before_break
          m = rule.before_break.match(str, pos)
          break unless m

          break_pos = m.end(0)
          # Step past a zero-width match so the scan always makes progress,
          # without letting that step leak into the reported position.
          pos = break_pos == m.begin(0) ? break_pos + 1 : break_pos
          matched = rule.after_break.nil? || m.post_match.start_with?(rule.after_break)
        elsif rule.after_break
          m = rule.after_break.match(str, pos)
          break unless m

          # The break sits immediately before the after_break match.
          break_pos = m.begin(0)
          pos = break_pos + 1
          matched = true
        else
          raise('Rule has neither before_break nor after_break')
        end

        results << [break_pos, rule] if matched && break_pos.positive?
      end

      results
    end

    # Cuts the segment +start...finish+ out of +str+ and re-inserts the
    # markups that fall inside it. Consumed markups are removed from
    # +markups+, which is why the caller must process segments in order.
    #
    # NOTE(review): the insertion offset and the boundary checks use the
    # segment's current length, which grows with every inserted markup;
    # confirm this matches the positions produced by the format's
    # +extract_markups+ when a segment holds several markups.
    #
    # @param str [String]
    # @param markups [Array<Array(Integer,String)>]
    # @param start [Integer] start offset of segment in str
    # @param finish [Integer] end offset of segment in str
    # @return [String]
    def build_segment!(str, markups, start, finish)
      segment = str[start...finish]

      until markups.empty?
        markup_pos, markup = markups.first
        break unless start + segment.length >= markup_pos

        # Markup sitting exactly on the trailing edge only joins this segment
        # if the <formathandle> rules say so.
        break if start + segment.length == markup_pos && !include_edge_formatting?(markup)

        segment.insert(markup_pos - start, markup)
        markups.shift
      end

      segment
    end

    # @param markup [String]
    # @return [Boolean] whether to include the specified edge markup in the
    #   current segment, in accordance with <formathandle> rules
    def include_edge_formatting?(markup)
      return false if !@data.include_start_formatting? && @format.start_formatting?(markup)
      return false if !@data.include_end_formatting? && @format.end_formatting?(markup)
      return false if !@data.include_isolated_formatting? && @format.isolated_formatting?(markup)

      true
    end
  end
end
data/lib/srx/format.rb ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'format/base_format'
4
+ require_relative 'format/text'
5
+ require_relative 'format/xml'
6
+
7
module Srx
  # Format-specific data and logic
  module Format
    # Lookup table from format name to the handler instance used for it.
    FORMATS = {
      text: Text.new,
      xml: Xml.new,
      html: Xml.new # TODO: specialize for HTML
    }.freeze

    # Looks up the handler registered under +format+.
    #
    # @param format [Symbol]
    # @return [BaseFormat]
    # @raise [ArgumentError] if no handler is registered under +format+
    def self.get(format)
      FORMATS.fetch(format) { raise(ArgumentError, "Unknown format: #{format}") }
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  module Format
    # Interface definition for format support. Every method here raises
    # NotImplementedError.
    class BaseFormat
      # Separates markup from text.
      #
      # @abstract
      # @param str [String]
      # @return [Array(String,Array<Array(Integer,String)>)] two items: 1) input
      #   +str+ with markups removed, and 2) a list of markups, i.e. +[pos,
      #   string]+ pairs
      # @raise [NotImplementedError] always
      def extract_markups(str)
        raise NotImplementedError
      end

      # Whether +markup+ is start formatting.
      #
      # @abstract
      # @param markup [String]
      # @return [Boolean]
      # @raise [NotImplementedError] always
      def start_formatting?(markup)
        raise NotImplementedError
      end

      # Whether +markup+ is end formatting.
      #
      # @abstract
      # @param markup [String]
      # @return [Boolean]
      # @raise [NotImplementedError] always
      def end_formatting?(markup)
        raise NotImplementedError
      end

      # Whether +markup+ is isolated formatting.
      #
      # @abstract
      # @param markup [String]
      # @return [Boolean]
      # @raise [NotImplementedError] always
      def isolated_formatting?(markup)
        raise NotImplementedError
      end
    end
  end
end