srx 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Development console: loads the gem through Bundler and drops into IRB.

require 'bundler/setup'
require 'srx'
require 'irb'

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

IRB.start(__FILE__)
data/bin/profile ADDED
@@ -0,0 +1,33 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Memory-profiles repeated calls to Srx::Engine#segment over the project's
# LICENSE.txt and prints the MemoryProfiler report.

require 'bundler/setup'
require 'srx'
require 'optparse'
require 'memory_profiler'

# Command-line options are collected into this hash, keyed by long option name.
options = {}
parser = OptionParser.new
parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
parser.on('-sFILE', '--srx FILE', 'SRX file (optional)')
parser.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
parser.parse!(into: options)

# Use the rules file given on the command line, or the bundled default rules.
srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default

format = (options[:format] || :text).to_sym
engine = Srx::Engine.new(data, format: format)

# Sample input: the license text, stripped and passed through Srx::Util.unwrap.
license_path = File.expand_path('../LICENSE.txt', __dir__)
license_text = Srx::Util.unwrap(File.read(license_path).strip)

# Number of segmentation passes performed inside the profiled block.
iterations = 100

report = MemoryProfiler.report do
  iterations.times { engine.segment(license_text, language: 'en') }
end

report.pretty_print
data/bin/segment ADDED
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Segments the text read from the given files (or standard input) and prints
# the result of Srx::Engine#segment.

require 'optparse'
require 'srx'

# Command-line options are collected into this hash, keyed by long option name.
options = {}
parser = OptionParser.new
parser.banner = "Usage: #{$PROGRAM_NAME} [options] [file ...]"
parser.on('-sFILE', '--srx FILE', 'SRX file (optional)')
parser.on('-lLANGUAGE', '--language LANGUAGE', 'Language of input text (default: en)')
parser.parse!(into: options)

language = options[:language] || 'en'

# Use the rules file given on the command line, or the bundled default rules.
srx_path = options[:srx]
data = srx_path ? Srx::Data.from_file(path: srx_path) : Srx::Data.default
engine = Srx::Engine.new(data)

begin
  # ARGF reads the remaining file arguments, or standard input if there are none.
  puts engine.segment(ARGF.read, language: language)
rescue Interrupt
  # Exit with a failure status, without a backtrace, when interrupted.
  exit(1)
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap: installs the project's dependencies with Bundler.

# Abort on command failure, on use of unset variables, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only.
IFS=$'\n\t'
# Print each line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
data/lib/srx.rb ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'srx/version'
4
+ require_relative 'srx/data'
5
+ require_relative 'srx/engine'
6
+ require_relative 'srx/icu_regex'
7
+ require_relative 'srx/util'
8
+ require_relative 'srx/format'
9
+
10
# Top-level namespace of the gem.
module Srx
  # Base error class for the gem.
  class Error < StandardError
  end
end
data/lib/srx/data.rb ADDED
@@ -0,0 +1,169 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+
5
module Srx
  # Abstract convenience wrapper around a Nokogiri XML node.
  #
  # Subclasses read attributes directly from +@xml+ and reach child elements
  # through {#xpath}.
  class XmlWrapper
    # Prefix-to-URI mapping passed to every XPath query.
    NS = { 'srx' => 'http://www.lisa.org/srx20' }.freeze

    # @param xml [Object] the XML node or document to wrap; it must respond
    #   to +xpath+ (and to +[]+ for attribute access in subclasses)
    def initialize(xml)
      @xml = xml
    end

    # Run an XPath query relative to the wrapped node.
    #
    # Every segment is prefixed with the +srx:+ namespace, so
    # +xpath(:srx, :header)+ evaluates +./srx:srx/srx:header+.
    #
    # @param segments [Array<Symbol,String>] path steps; a step may include
    #   an XPath predicate
    # @return [Object] whatever the wrapped node's +xpath+ returns
    def xpath(*segments)
      path = ['.', *segments].join('/srx:')
      @xml.xpath(path, NS)
    end
  end

  private_constant :XmlWrapper

  # SRX data
  class Data < XmlWrapper
    class << self
      # Default SRX rules, loaded from the sample file next to this source
      # file.
      #
      # @return [Data]
      def default
        from_file(path: File.expand_path('srx-20-sample.srx', __dir__))
      end

      # @param path [String] path of an SRX file
      # @return [Data]
      def from_file(path:)
        # Block form so the file handle is closed once parsing is done.
        File.open(path) { |io| from_io(io) }
      end

      # @param io [IO] stream containing an SRX document
      # @return [Data]
      def from_io(io)
        new(Nokogiri::XML.parse(io))
      end
    end

    # @return [Boolean] whether the header's +segmentsubflows+ attribute is
    #   +yes+
    def segment_subflows?
      header['segmentsubflows'] == 'yes'
    end

    # @return [Boolean] whether the header's +cascade+ attribute is +yes+
    def cascade?
      header['cascade'] == 'yes'
    end

    # @return [Boolean]
    def include_start_formatting?
      include_formatting?(:start)
    end

    # @return [Boolean]
    def include_end_formatting?
      include_formatting?(:end)
    end

    # @return [Boolean]
    def include_isolated_formatting?
      include_formatting?(:isolated)
    end

    # @return [Array<LanguageRule>]
    def language_rules
      @language_rules ||=
        xpath(:srx, :body, :languagerules, :languagerule)
        .map { |langrule| LanguageRule.new(langrule) }
    end

    # @return [Array<LanguageMap>]
    def map_rules
      @map_rules ||=
        xpath(:srx, :body, :maprules, :languagemap)
        .map { |maprule| LanguageMap.new(maprule) }
    end

    private

    # @return [Object,nil] the first +<header>+ element, if any
    def header
      @header ||= xpath(:srx, :header).first
    end

    # @param type [Symbol]
    # @return [Object,nil] the +<formathandle>+ element of the given type,
    #   if the header declares one
    def format_handle(type)
      xpath(:srx, :header, "formathandle[@type='#{type}']").first
    end

    # Whether formatting of the given type belongs to the segment it borders.
    #
    # @param type [Symbol] one of +:start+, +:end+, +:isolated+
    # @return [Boolean]
    # @raise [ArgumentError] if no +<formathandle>+ is declared for +type+
    #   and +type+ is not one of the known types
    def include_formatting?(type)
      elem = format_handle(type)

      return elem['include'] == 'yes' if elem

      # Defaults are
      #   <formathandle type="start" include="no"/>
      #   <formathandle type="end" include="yes"/>
      #   <formathandle type="isolated" include="no"/>
      #
      # The symbols are listed individually: `when` compares with `===`, and
      # an Array literal would never match a Symbol.
      case type
      when :start, :isolated then false
      when :end then true
      else raise(ArgumentError, "Unknown formatting type: #{type}")
      end
    end

    # SRX <languagerule> element
    class LanguageRule < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def name
        @xml['languagerulename']
      end

      # @return [Array<Rule>]
      def rules
        @rules ||= xpath(:rule).map { |rule| Rule.new(rule) }
      end

      # SRX <rule> element
      class Rule < XmlWrapper
        # @return [Regexp,nil]
        attr_reader :before_break

        # @return [Regexp,nil]
        attr_reader :after_break

        # @param xml [Object] the +<rule>+ element
        def initialize(xml)
          super(xml)

          # Eagerly load everything for this class because before_break and
          # after_break can be legitimately nil, so lazy loading gets ugly.

          # A rule breaks unless its `break` attribute says otherwise.
          @break = @xml['break'].nil? || @xml['break'] == 'yes'
          @before_break = compile_child(:beforebreak)
          @after_break = compile_child(:afterbreak)
        end

        # @return [Boolean] whether this rule marks a break (as opposed to an
        #   exception that suppresses one)
        def break?
          @break
        end

        # @return [String]
        def inspect
          "Rule[break=#{break?},before=#{before_break},after=#{after_break}]"
        end

        private

        # Compile the text of the first child element with the given name.
        #
        # @param element [Symbol] child element name
        # @return [Regexp,nil] nil when the element is absent
        def compile_child(element)
          pattern = xpath(element).first&.text
          IcuRegex.compile(pattern) if pattern
        end
      end
    end

    # SRX <languagemap> element
    class LanguageMap < XmlWrapper
      # @return [String] value of the +languagerulename+ attribute
      def language_rule_name
        @xml['languagerulename']
      end

      # @return [Regexp,nil] the compiled +languagepattern+ attribute, or nil
      #   when the attribute is absent
      def language_pattern
        @language_pattern ||= @xml['languagepattern'].then do |pattern|
          IcuRegex.compile(pattern) if pattern
        end
      end
    end
  end
end
data/lib/srx/engine.rb ADDED
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  # Engine for performing SRX segmenting
  class Engine
    # @return [Data]
    attr_reader :data

    # @param data [Data] the SRX rules to segment with
    # @param format [Symbol] name of the input format, resolved through
    #   +Format.get+
    def initialize(data, format: :text)
      @data = data
      @format = Format.get(format)
    end

    # Split +str+ into segments according to the rules for +language+.
    #
    # Markup is extracted by the format first, breaks are computed on the
    # remaining plain text, and the markup is put back into each segment.
    #
    # @param str [String]
    # @param language [String]
    # @return [Array<String>]
    def segment(str, language:)
      active_rules = rules(language)
      plain_text, markups = @format.extract_markups(str)

      pos = 0
      breaks_by_pos(plain_text, active_rules).map do |break_pos, _rule|
        build_segment!(plain_text, markups, pos, break_pos).tap { pos = break_pos }
      end
    end

    # @param language [String]
    # @return [Array<Data::LanguageRule::Rule>] the rules of every language
    #   rule that applies to +language+, in map order
    def rules(language)
      rule_map = @data.language_rules.to_h { |lang_rule| [lang_rule.name, lang_rule] }

      rule_names(language).flat_map { |name| rule_map[name].rules }
    end

    # @param language [String]
    # @return [Array<String>] names of the language rules whose map pattern
    #   matches +language+: all of them when the data cascades, otherwise
    #   only the first
    def rule_names(language)
      maps = @data.map_rules

      if @data.cascade?
        maps.select { |lang_map| lang_map.language_pattern.match?(language) }
            .map(&:language_rule_name)
      else
        first_match = maps.find { |lang_map| lang_map.language_pattern.match?(language) }
        first_match ? [first_match.language_rule_name] : []
      end
    end

    # @param str [String]
    # @param rules [Array<Data::LanguageRule::Rule>]
    # @return [Array<Array(Integer,Data::LanguageRule::Rule)>] pairs of 1) the
    #   position of a break, and 2) the rule that matched at that position,
    #   sorted by position. Note that the final break will always be at the
    #   end of the string and may not have an associated rule.
    def breaks_by_pos(str, rules)
      breaks = rules
               .flat_map { |rule| all_matches(str, rule) }
               .group_by(&:first)
               # Rules are visited in order, so the first pair at a position
               # belongs to the earliest rule; that rule decides the position.
               .transform_values { |pairs| pairs.first.last }
               .select { |_pos, rule| rule.break? }
               .sort_by(&:first)

      breaks << [str.length] unless breaks.last&.first == str.length
      breaks
    end

    # Find every position in +str+ at which +rule+ matches.
    #
    # A position lies between the text matched by +before_break+ and the text
    # matched by +after_break+. Position 0 is never reported.
    #
    # @param str [String]
    # @param rule [Data::LanguageRule::Rule]
    # @return [Array<Array(Integer,Data::LanguageRule::Rule)>]
    # @raise [RuntimeError] if +str+ is non-empty and the rule has neither
    #   pattern
    def all_matches(str, rule)
      results = []

      pos = 0
      while pos < str.length
        if rule.before_break
          m = rule.before_break.match(str, pos)
          break unless m

          # The candidate position is the end of the before-break match,
          # which is also where post_match (and thus after_break) starts.
          break_pos = m.end(0)
          matched = rule.after_break.nil? || m.post_match.start_with?(rule.after_break)

          # Resume after the match; step one further on a zero-width match
          # so the search always advances.
          pos = m.begin(0) == break_pos ? break_pos + 1 : break_pos
        elsif rule.after_break
          m = rule.after_break.match(str, pos)
          break unless m

          # With no before-break pattern the position sits immediately
          # before the after-break text.
          break_pos = m.begin(0)
          matched = true
          pos = break_pos + 1
        else
          raise('Rule has neither before_break nor after_break')
        end

        results << [break_pos, rule] if matched && break_pos.positive?
      end

      results
    end

    # Cut one segment out of +str+ and re-insert the markup that belongs to
    # it. Consumed entries are removed from the front of +markups+.
    #
    # @param str [String]
    # @param markups [Array<Array(Integer,String)>]
    # @param start [Integer] start offset of segment in str
    # @param finish [Integer] end offset of segment in str
    # @return [String]
    def build_segment!(str, markups, start, finish)
      segment = str[start...finish]

      until markups.empty?
        markup_pos, markup = markups.first
        break unless start + segment.length >= markup_pos

        # Markup sitting exactly on the segment's end is only taken when the
        # <formathandle> rules say so.
        break if start + segment.length == markup_pos && !include_edge_formatting?(markup)

        segment.insert(markup_pos - start, markup)
        markups.shift
      end

      segment
    end

    # @param markup [String]
    # @return [Boolean] whether to include the specified edge markup in the
    #   current segment, in accordance with <formathandle> rules
    def include_edge_formatting?(markup)
      return false if !@data.include_start_formatting? && @format.start_formatting?(markup)
      return false if !@data.include_end_formatting? && @format.end_formatting?(markup)
      return false if !@data.include_isolated_formatting? && @format.isolated_formatting?(markup)

      true
    end
  end
end
data/lib/srx/format.rb ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'format/base_format'
4
+ require_relative 'format/text'
5
+ require_relative 'format/xml'
6
+
7
module Srx
  # Format-specific data and logic
  module Format
    # Format handlers, keyed by format name.
    FORMATS = {
      text: Text.new,
      xml: Xml.new,
      html: Xml.new # TODO: specialize for HTML
    }.freeze

    # Look up the handler registered for a format name.
    #
    # @param format [Symbol]
    # @return [BaseFormat]
    # @raise [ArgumentError] if no handler is registered under +format+
    def self.get(format)
      FORMATS.fetch(format) { raise(ArgumentError, "Unknown format: #{format}") }
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module Srx
  module Format
    # Interface definition for format support.
    #
    # Every method here raises NotImplementedError.
    class BaseFormat
      # Separate markup from the text to be segmented.
      #
      # @abstract
      # @param str [String]
      # @return [Array(String,Array<Array(Integer,String)>)] two items: 1) input
      #   +str+ with markups removed, and 2) a list of markups, i.e. +[pos,
      #   string]+ pairs
      def extract_markups(str)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def start_formatting?(markup)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def end_formatting?(markup)
        raise NotImplementedError
      end

      # @abstract
      # @param markup [String]
      # @return [Boolean]
      def isolated_formatting?(markup)
        raise NotImplementedError
      end
    end
  end
end