text-hyphen 1.4.1 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.standard.yml +5 -0
- data/Code-of-Conduct.md +73 -0
- data/Contributing.md +68 -0
- data/History.md +139 -0
- data/Licence.md +159 -0
- data/Manifest.txt +12 -5
- data/README.md +81 -0
- data/Rakefile +68 -19
- data/bin/ruby-hyphen +0 -0
- data/lib/text/hyphen/language/1.8/de1.rb +1307 -571
- data/lib/text/hyphen/language/1.8/en_us.rb +412 -453
- data/lib/text/hyphen/language/1.8/fr.rb +128 -334
- data/lib/text/hyphen/language/1.8/la.rb +1 -0
- data/lib/text/hyphen/language/1.8/lt.rb +115 -0
- data/lib/text/hyphen/language/1.8/pt.rb +2 -1
- data/lib/text/hyphen/language/1.8/sk.rb +277 -0
- data/lib/text/hyphen/language/1.9/ca.rb +2 -1
- data/lib/text/hyphen/language/1.9/cs.rb +2 -1
- data/lib/text/hyphen/language/1.9/da.rb +2 -1
- data/lib/text/hyphen/language/1.9/de1.rb +1382 -646
- data/lib/text/hyphen/language/1.9/de2.rb +110 -109
- data/lib/text/hyphen/language/1.9/en_uk.rb +2 -1
- data/lib/text/hyphen/language/1.9/en_us.rb +412 -454
- data/lib/text/hyphen/language/1.9/es.rb +2 -1
- data/lib/text/hyphen/language/1.9/et.rb +6 -5
- data/lib/text/hyphen/language/1.9/eu.rb +4 -3
- data/lib/text/hyphen/language/1.9/fi.rb +3 -2
- data/lib/text/hyphen/language/1.9/fr.rb +136 -343
- data/lib/text/hyphen/language/1.9/ga.rb +27 -26
- data/lib/text/hyphen/language/1.9/hr.rb +6 -5
- data/lib/text/hyphen/language/1.9/hsb.rb +3 -2
- data/lib/text/hyphen/language/1.9/hu1.rb +3 -2
- data/lib/text/hyphen/language/1.9/hu2.rb +5 -4
- data/lib/text/hyphen/language/1.9/ia.rb +2 -1
- data/lib/text/hyphen/language/1.9/id.rb +8 -7
- data/lib/text/hyphen/language/1.9/is.rb +2 -1
- data/lib/text/hyphen/language/1.9/it.rb +74 -74
- data/lib/text/hyphen/language/1.9/la.rb +54 -53
- data/lib/text/hyphen/language/1.9/lt.rb +116 -0
- data/lib/text/hyphen/language/1.9/mn.rb +7 -6
- data/lib/text/hyphen/language/1.9/nl.rb +2 -1
- data/lib/text/hyphen/language/1.9/no1.rb +3 -2
- data/lib/text/hyphen/language/1.9/no2.rb +3 -2
- data/lib/text/hyphen/language/1.9/pl.rb +2 -1
- data/lib/text/hyphen/language/1.9/pt.rb +3 -2
- data/lib/text/hyphen/language/1.9/ru.rb +2 -1
- data/lib/text/hyphen/language/1.9/sk.rb +280 -0
- data/lib/text/hyphen/language/1.9/sv.rb +4 -3
- data/lib/text/hyphen/language/cs.rb +1 -1
- data/lib/text/hyphen/language/de.rb +2 -1
- data/lib/text/hyphen/language/de1.rb +1 -1
- data/lib/text/hyphen/language/de2.rb +1 -1
- data/lib/text/hyphen/language/en_us.rb +1 -1
- data/lib/text/hyphen/language/eu.rb +1 -1
- data/lib/text/hyphen/language/fr.rb +1 -1
- data/lib/text/hyphen/language/hu.rb +1 -1
- data/lib/text/hyphen/language/hu1.rb +1 -1
- data/lib/text/hyphen/language/hu2.rb +1 -1
- data/lib/text/hyphen/language/is.rb +1 -1
- data/lib/text/hyphen/language/lt.rb +4 -0
- data/lib/text/hyphen/language/ms.rb +3 -3
- data/lib/text/hyphen/language/nl.rb +1 -1
- data/lib/text/hyphen/language/no.rb +1 -1
- data/lib/text/hyphen/language/sk.rb +4 -0
- data/lib/text/hyphen/language.rb +45 -45
- data/lib/text/hyphen.rb +139 -97
- data/lib/text-hyphen.rb +1 -1
- data/test/data/bug_9807_latin1.rb +2 -2
- data/test/data/bug_9807_utf-8.rb +1 -1
- data/test/test_bugs.rb +14 -13
- data/test/test_text_hyphen.rb +31 -21
- metadata +146 -96
- data/.autotest +0 -23
- data/.gemtest +0 -0
- data/History.rdoc +0 -99
- data/License.rdoc +0 -159
- data/README.rdoc +0 -95
- data/text-hyphen.gemspec +0 -51
@@ -1,9 +1,9 @@
|
|
1
|
-
require
|
1
|
+
require "text/hyphen/language/id"
|
2
2
|
|
3
3
|
unless defined? Text::Hyphen::Language::MS
|
4
4
|
Text::Hyphen::Language::MS = Text::Hyphen::Language.new(Text::Hyphen::Language::ID) do |malay|
|
5
|
-
malay.isocode =
|
5
|
+
malay.isocode = "ms"
|
6
6
|
end
|
7
7
|
|
8
|
-
Text::Hyphen::Language.aliases_for "MS" => %W
|
8
|
+
Text::Hyphen::Language.aliases_for "MS" => %W[MAY MSA]
|
9
9
|
end
|
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require "text/hyphen/language/no1"
|
data/lib/text/hyphen/language.rb
CHANGED
@@ -4,32 +4,32 @@
|
|
4
4
|
# patterns are defined as instances of this class—and only this class. This
|
5
5
|
# is a deliberate "breaking" of Ruby's concept of duck-typing and is
|
6
6
|
# intended to provide an indication that the patterns have been converted
|
7
|
-
# from TeX encodings to other encodings (e.g.,
|
7
|
+
# from TeX encodings to other encodings (e.g., iso-8859-1 or UTF-8) that are
|
8
8
|
# more suitable to general text manipulations.
|
9
9
|
class Text::Hyphen::Language
|
10
|
-
WORD_START_RE
|
11
|
-
WORD_END_RE
|
12
|
-
DIGIT_RE
|
13
|
-
NONDIGIT_RE
|
14
|
-
DASH_RE
|
15
|
-
EXCEPTION_DASH0_RE
|
16
|
-
EXCEPTION_DASH1_RE
|
17
|
-
EXCEPTION_NONUM_RE
|
18
|
-
ZERO_INSERT_RE
|
19
|
-
ZERO_START_RE
|
20
|
-
|
21
|
-
DEFAULT_ENCODING
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
10
|
+
WORD_START_RE = %r{^\.} # :nodoc:
|
11
|
+
WORD_END_RE = %r{\.$} # :nodoc:
|
12
|
+
DIGIT_RE = %r{\d} # :nodoc:
|
13
|
+
NONDIGIT_RE = %r{\D} # :nodoc:
|
14
|
+
DASH_RE = %r{-} # :nodoc:
|
15
|
+
EXCEPTION_DASH0_RE = %r{[^-](?=[^-])} # :nodoc:
|
16
|
+
EXCEPTION_DASH1_RE = %r{[^-]-} # :nodoc:
|
17
|
+
EXCEPTION_NONUM_RE = %r{[^01]} # :nodoc:
|
18
|
+
ZERO_INSERT_RE = %r{(\D)(?=\D)} # :nodoc:
|
19
|
+
ZERO_START_RE = %r{^(?=\D)} # :nodoc:
|
20
|
+
|
21
|
+
DEFAULT_ENCODING = if RUBY_VERSION < "1.9.1" # :nodoc:
|
22
|
+
"iso-8859-1"
|
23
|
+
else
|
24
|
+
"utf-8"
|
25
|
+
end
|
26
26
|
|
27
27
|
# The character scan regular expression to use.
|
28
|
-
def scan_re
|
29
|
-
if RUBY_VERSION <
|
28
|
+
def scan_re # :nodoc:
|
29
|
+
if RUBY_VERSION < "1.9.1"
|
30
30
|
return %r{.}u if @encoding =~ /utf-?8/i
|
31
31
|
end
|
32
|
-
|
32
|
+
%r{.}
|
33
33
|
end
|
34
34
|
|
35
35
|
# The encoding of the hyphenation definitions. The text to be compared
|
@@ -66,21 +66,21 @@ class Text::Hyphen::Language
|
|
66
66
|
@pattern_text = pats.dup
|
67
67
|
|
68
68
|
@patterns = {
|
69
|
-
:both
|
70
|
-
:start
|
71
|
-
:stop
|
69
|
+
:both => {},
|
70
|
+
:start => {},
|
71
|
+
:stop => {},
|
72
72
|
:hyphen => {}
|
73
73
|
}
|
74
74
|
|
75
|
-
plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$},
|
75
|
+
plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$}, "") }
|
76
76
|
plist.each do |line|
|
77
77
|
line.split.each do |word|
|
78
78
|
next if word.empty?
|
79
79
|
|
80
80
|
start = stop = false
|
81
81
|
|
82
|
-
start = true if word.sub!(WORD_START_RE,
|
83
|
-
stop
|
82
|
+
start = true if word.sub!(WORD_START_RE, "")
|
83
|
+
stop = true if word.sub!(WORD_END_RE, "")
|
84
84
|
|
85
85
|
# Insert zeroes and start with some digit
|
86
86
|
word.gsub!(ZERO_INSERT_RE) { "#{$1}0" }
|
@@ -88,17 +88,17 @@ class Text::Hyphen::Language
|
|
88
88
|
|
89
89
|
# This assumes that the pattern lists are already in lowercase
|
90
90
|
# form only.
|
91
|
-
tag
|
92
|
-
value = word.gsub(NONDIGIT_RE,
|
91
|
+
tag = word.gsub(DIGIT_RE, "")
|
92
|
+
value = word.gsub(NONDIGIT_RE, "")
|
93
93
|
|
94
|
-
if start
|
95
|
-
|
94
|
+
set = if start && stop
|
95
|
+
:both
|
96
96
|
elsif start
|
97
|
-
|
97
|
+
:start
|
98
98
|
elsif stop
|
99
|
-
|
99
|
+
:stop
|
100
100
|
else
|
101
|
-
|
101
|
+
:hyphen
|
102
102
|
end
|
103
103
|
|
104
104
|
@patterns[set][tag] = value
|
@@ -116,10 +116,10 @@ class Text::Hyphen::Language
|
|
116
116
|
@exceptions = {}
|
117
117
|
|
118
118
|
@exception_text.split.each do |word|
|
119
|
-
tag
|
120
|
-
value = "0" + word.gsub(EXCEPTION_DASH0_RE,
|
121
|
-
value.gsub!(EXCEPTION_NONUM_RE,
|
122
|
-
@exceptions[tag] = value.scan(
|
119
|
+
tag = word.gsub(DASH_RE, "")
|
120
|
+
value = "0" + word.gsub(EXCEPTION_DASH0_RE, "0").gsub(EXCEPTION_DASH1_RE, "1")
|
121
|
+
value.gsub!(EXCEPTION_NONUM_RE, "0")
|
122
|
+
@exceptions[tag] = value.scan(scan_re).map { |c| c.to_i }
|
123
123
|
end
|
124
124
|
|
125
125
|
true
|
@@ -142,16 +142,16 @@ class Text::Hyphen::Language
|
|
142
142
|
# instance of Text::Hyphen::Language.
|
143
143
|
def initialize(language = nil)
|
144
144
|
if language.nil?
|
145
|
-
|
146
|
-
|
147
|
-
|
145
|
+
encoding DEFAULT_ENCODING
|
146
|
+
patterns ""
|
147
|
+
exceptions ""
|
148
148
|
self.left = 2
|
149
149
|
self.right = 2
|
150
150
|
self.isocode = nil
|
151
|
-
elsif language.
|
152
|
-
|
153
|
-
|
154
|
-
|
151
|
+
elsif language.is_a? Text::Hyphen::Language
|
152
|
+
encoding language.encoding
|
153
|
+
patterns language.instance_variable_get(:@pattern_text)
|
154
|
+
exceptions language.instance_variable_get(:@exception_text)
|
155
155
|
self.left = language.left
|
156
156
|
self.right = language.right
|
157
157
|
self.isocode = language.isocode
|
@@ -171,7 +171,7 @@ class Text::Hyphen::Language
|
|
171
171
|
end
|
172
172
|
language = const_get(language)
|
173
173
|
|
174
|
-
[
|
174
|
+
[alias_names].flatten.each do |alias_name|
|
175
175
|
next if const_defined? alias_name
|
176
176
|
const_set(alias_name, language)
|
177
177
|
end
|
data/lib/text/hyphen.rb
CHANGED
@@ -7,10 +7,21 @@ end
|
|
7
7
|
# hyphenation algorithm with pattern files. Each object is constructed with
|
8
8
|
# a specific language's hyphenation patterns.
|
9
9
|
class Text::Hyphen
|
10
|
-
|
11
|
-
|
10
|
+
# Resolves a file for cleaner loading from a hyphenation loader file.
|
11
|
+
def self.require_real_hyphenation_file(loader) # :nodoc:
|
12
|
+
p = File.dirname(loader)
|
13
|
+
f = File.basename(loader)
|
14
|
+
v = if RUBY_VERSION < "1.9.1"
|
15
|
+
"1.8"
|
16
|
+
else
|
17
|
+
"1.9"
|
18
|
+
end
|
19
|
+
require File.join(p, v, f)
|
20
|
+
end
|
12
21
|
|
13
|
-
|
22
|
+
VERSION = "1.5.0"
|
23
|
+
|
24
|
+
DEFAULT_MIN_LEFT = 2
|
14
25
|
DEFAULT_MIN_RIGHT = 2
|
15
26
|
|
16
27
|
# No fewer than this number of letters will show up to the left of the
|
@@ -26,31 +37,31 @@ class Text::Hyphen
|
|
26
37
|
# two or three character ISO 639 code, with the two character form being
|
27
38
|
# the canonical resource name. This will load the language hyphenation
|
28
39
|
# definitions from text/hyphen/language/<code> as a Ruby class. The
|
29
|
-
# resource
|
40
|
+
# resource "text/hyphen/language/en_us" defines the language class
|
30
41
|
# Text::Hyphen::Language::EN_US. It also defines the secondary forms
|
31
42
|
# Text::Hyphen::Language::EN and Text::Hyphen::Language::ENG_US.
|
32
43
|
#
|
33
44
|
# Minimal transformations will be performed on the language code provided,
|
34
|
-
# such that any dashes are converted to underscores (e.g.,
|
35
|
-
#
|
36
|
-
# downcased and class names will be converted to uppercase (e.g.,
|
37
|
-
# the Portuguese language becomes
|
45
|
+
# such that any dashes are converted to underscores (e.g., "en-us" becomes
|
46
|
+
# "en_us") and all characters are regularised. Resource names will be
|
47
|
+
# downcased and class names will be converted to uppercase (e.g., "Pt" for
|
48
|
+
# the Portuguese language becomes "pt" and "PT", respectively).
|
38
49
|
#
|
39
50
|
# The language may also be specified as an instance of
|
40
51
|
# Text::Hyphen::Language.
|
41
|
-
|
52
|
+
#
|
53
|
+
# :attr_accessor: language
|
54
|
+
attr_reader :language
|
42
55
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
if lang.kind_of? Text::Hyphen::Language
|
56
|
+
def language=(lang) # :nodoc:
|
57
|
+
require "text/hyphen/language" unless defined?(Text::Hyphen::Language)
|
58
|
+
if lang.is_a? Text::Hyphen::Language
|
47
59
|
@iso_language = lang.to_s.split(%r{::}o)[-1].downcase
|
48
|
-
@language
|
60
|
+
@language = lang
|
49
61
|
else
|
50
62
|
@iso_language = lang.downcase
|
51
63
|
load_language
|
52
64
|
end
|
53
|
-
@iso_language
|
54
65
|
end
|
55
66
|
|
56
67
|
# Returns the language's ISO 639 ID, e.g., "en_us" or "pt".
|
@@ -70,23 +81,22 @@ class Text::Hyphen
|
|
70
81
|
# methods in an initialization block. The following initializations are
|
71
82
|
# all equivalent:
|
72
83
|
#
|
73
|
-
# hyp = Text::Hyphenate.new(:
|
74
|
-
# hyp = Text::Hyphenate.new
|
75
|
-
# hyp = Text::Hyphenate.new { |h| h.language = 'en_us' }
|
84
|
+
# hyp = Text::Hyphen.new(language: "en_us")
|
85
|
+
# hyp = Text::Hyphen.new { |h| h.language = "en_us" }
|
76
86
|
def initialize(options = {}) # :yields self:
|
77
87
|
@iso_language = options[:language]
|
78
|
-
@left
|
79
|
-
@right
|
80
|
-
@language
|
88
|
+
@left = options[:left]
|
89
|
+
@right = options[:right]
|
90
|
+
@language = nil
|
81
91
|
|
82
|
-
@cache
|
83
|
-
@vcache
|
92
|
+
@cache = {}
|
93
|
+
@vcache = {}
|
84
94
|
|
85
|
-
@hyphen
|
95
|
+
@hyphen = {}
|
86
96
|
@begin_hyphen = {}
|
87
|
-
@end_hyphen
|
88
|
-
@both_hyphen
|
89
|
-
@exception
|
97
|
+
@end_hyphen = {}
|
98
|
+
@both_hyphen = {}
|
99
|
+
@exception = {}
|
90
100
|
|
91
101
|
@first_load = true
|
92
102
|
yield self if block_given?
|
@@ -94,57 +104,87 @@ class Text::Hyphen
|
|
94
104
|
|
95
105
|
load_language
|
96
106
|
|
97
|
-
@left
|
107
|
+
@left ||= DEFAULT_MIN_LEFT
|
98
108
|
@right ||= DEFAULT_MIN_RIGHT
|
99
109
|
end
|
100
110
|
|
101
111
|
# Returns an array of character positions where a word can be hyphenated.
|
102
112
|
#
|
103
|
-
# hyp.hyphenate(
|
113
|
+
# hyp.hyphenate("representation") #=> [3, 5, 8, 10]
|
104
114
|
#
|
105
115
|
# Because hyphenation can be expensive, if the word has been hyphenated
|
106
116
|
# previously, it will be returned from a per-instance cache.
|
117
|
+
#
|
118
|
+
# #hyphenate supports phrase hyphenation:
|
119
|
+
#
|
120
|
+
# hyp.hyphenate("This useful library supports phrases and sentences.")
|
121
|
+
# #=> [8, 14, 23, 27, 34, 44]
|
122
|
+
#
|
123
|
+
# When phrases are hyphenated, each word is processed individually and the
|
124
|
+
# result is returned as a single continuous list of hyphenation points.
|
107
125
|
def hyphenate(word)
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
126
|
+
words = if phrase?(word)
|
127
|
+
word.downcase.split(/[[:space:]]/)
|
128
|
+
else
|
129
|
+
[word.downcase]
|
130
|
+
end
|
131
|
+
|
132
|
+
points = words.map do |word|
|
133
|
+
next @cache[word] if @cache.has_key?(word)
|
134
|
+
|
135
|
+
if (exception = @language.exceptions[word])
|
136
|
+
next @cache[word] = make_result_list(exception)
|
137
|
+
end
|
138
|
+
|
139
|
+
letters = word.scan(@language.scan_re)
|
140
|
+
word_size = letters.size
|
141
|
+
|
142
|
+
result = [0] * (word_size + 1)
|
143
|
+
right_stop = word_size - @right
|
144
|
+
|
145
|
+
updater = proc do |hash, str, pos|
|
146
|
+
if hash.has_key?(str)
|
147
|
+
hash[str].scan(@language.scan_re).each_with_index do |cc, ii|
|
148
|
+
cc = cc.to_i
|
149
|
+
result[ii + pos] = cc if cc > result[ii + pos]
|
150
|
+
end
|
127
151
|
end
|
128
|
-
$stderr.print ": #{result.inspect}\n" if DEBUG
|
129
152
|
end
|
130
|
-
end
|
131
153
|
|
132
154
|
# Walk the word
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
155
|
+
(0..right_stop).each do |pos|
|
156
|
+
rest_length = word_size - pos
|
157
|
+
(1..rest_length).each do |length|
|
158
|
+
substr = letters[pos, length].join("")
|
159
|
+
updater[@language.hyphen, substr, pos]
|
160
|
+
updater[@language.start, substr, pos] if pos.zero?
|
161
|
+
updater[@language.stop, substr, pos] if length == rest_length
|
162
|
+
end
|
140
163
|
end
|
164
|
+
|
165
|
+
updater[@language.both, word, 0] if @language.both[word]
|
166
|
+
|
167
|
+
(0..@left).each { |i| result[i] = 0 }
|
168
|
+
((-1 - @right)..-1).each { |i| result[i] = 0 }
|
169
|
+
@cache[word] = make_result_list(result)
|
141
170
|
end
|
142
171
|
|
143
|
-
|
172
|
+
if points.length > 1
|
173
|
+
offset = 0
|
174
|
+
result = []
|
144
175
|
|
145
|
-
|
146
|
-
|
147
|
-
|
176
|
+
points.each_with_index do |word, i|
|
177
|
+
word.each do |pos|
|
178
|
+
result << pos + offset
|
179
|
+
end
|
180
|
+
|
181
|
+
offset += words[i].length + 1
|
182
|
+
end
|
183
|
+
|
184
|
+
result
|
185
|
+
else
|
186
|
+
points.flatten
|
187
|
+
end
|
148
188
|
end
|
149
189
|
|
150
190
|
# Returns a visualization of the hyphenation points.
|
@@ -157,8 +197,15 @@ class Text::Hyphen
|
|
157
197
|
#
|
158
198
|
# Because hyphenation can be expensive, if the word has been visualised
|
159
199
|
# previously, it will be returned from a per-instance cache.
|
160
|
-
|
200
|
+
#
|
201
|
+
# #visualise supports phrase hyphenation:
|
202
|
+
#
|
203
|
+
# hyp.visualise("This useful library supports phrases and sentences.")
|
204
|
+
# #=> This use-ful li-brary sup-port-s phras-es and sen-tences.
|
205
|
+
def visualise(word, hyphen = "-")
|
206
|
+
return visualise_phrase(word, hyphen) if phrase?(word)
|
161
207
|
return @vcache[word] if @vcache.has_key?(word)
|
208
|
+
|
162
209
|
w = word.dup
|
163
210
|
s = hyphen.size
|
164
211
|
hyphenate(w).each_with_index do |pos, n|
|
@@ -168,7 +215,7 @@ class Text::Hyphen
|
|
168
215
|
end
|
169
216
|
@vcache[word] = w
|
170
217
|
end
|
171
|
-
|
218
|
+
alias_method :visualize, :visualise
|
172
219
|
|
173
220
|
# Clears the per-instance hyphenation and visualization caches.
|
174
221
|
def clear_cache!
|
@@ -177,29 +224,33 @@ class Text::Hyphen
|
|
177
224
|
end
|
178
225
|
|
179
226
|
# This function will hyphenate a word so that the first point is at most
|
227
|
+
# +size+ characters.
|
180
228
|
#
|
181
229
|
# NOTE: if hyphen is set to a string, it will still be counted as one
|
182
230
|
# character (since it represents a hyphen)
|
183
231
|
#
|
184
|
-
#
|
185
|
-
|
232
|
+
# #hyphenate_to does not support phrase hyphenation and will raise an
|
233
|
+
# exception if there are spaces.
|
234
|
+
def hyphenate_to(word, size, hyphen = "-")
|
235
|
+
raise ArgumentError, "#hyphenate_to does not support phrases" if phrase?(word)
|
236
|
+
|
186
237
|
point = hyphenate(word).delete_if { |e| e >= size }.max
|
187
238
|
if point.nil?
|
188
239
|
[nil, word]
|
189
240
|
else
|
190
|
-
[word[0
|
241
|
+
[word[0...point] + hyphen, word[point..-1]]
|
191
242
|
end
|
192
243
|
end
|
193
244
|
|
194
245
|
# Returns a string describing the structure of the patterns for the
|
195
246
|
# language of this hyphenation object.
|
196
247
|
def stats
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
248
|
+
stats_both = @language.both.size
|
249
|
+
stats_start = @language.start.size
|
250
|
+
stats_end = @language.stop.size
|
251
|
+
stats_hyphens = @language.hyphen.size
|
252
|
+
stats_exceptions = @language.exceptions.size
|
253
|
+
stats_total = stats_both + stats_start + stats_end + stats_hyphens + stats_exceptions
|
203
254
|
|
204
255
|
s = <<-EOS
|
205
256
|
|
@@ -210,25 +261,13 @@ The language '%s' contains %d total hyphenation patterns.
|
|
210
261
|
% 6d patterns are normal patterns.
|
211
262
|
% 6d patterns are exceptions.
|
212
263
|
|
213
|
-
EOS
|
214
|
-
s % [
|
264
|
+
EOS
|
265
|
+
s % [@iso_language, stats_total, stats_start, stats_end, stats_both, stats_hyphens, stats_exceptions]
|
215
266
|
end
|
216
267
|
|
217
|
-
def updateresult(hash, str, pos)
|
218
|
-
if hash.has_key?(str)
|
219
|
-
STDERR.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
|
220
|
-
hash[str].scan(@language.scan_re).each_with_index do |c, i|
|
221
|
-
c = c.to_i
|
222
|
-
@result[i + pos] = c if c > @result[i + pos]
|
223
|
-
end
|
224
|
-
STDERR.puts ": #{@result}" if DEBUG
|
225
|
-
end
|
226
|
-
end
|
227
|
-
private :updateresult
|
228
|
-
|
229
268
|
def make_result_list(res)
|
230
269
|
r = []
|
231
|
-
res.each_with_index { |c, i| r <<
|
270
|
+
res.each_with_index { |c, i| r << i * (c.to_i % 2) }
|
232
271
|
r.reject { |i| i.to_i == 0 }
|
233
272
|
end
|
234
273
|
private :make_result_list
|
@@ -251,17 +290,20 @@ EOS
|
|
251
290
|
end
|
252
291
|
private :load_language
|
253
292
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
293
|
+
def split_phrase(phrase)
|
294
|
+
phrase.split(/[[:space:]]+/)
|
295
|
+
end
|
296
|
+
private :split_phrase
|
297
|
+
|
298
|
+
def visualise_phrase(phrase, hyphen)
|
299
|
+
split_phrase(phrase).map { |word| visualise(word, hyphen) }.join(" ")
|
300
|
+
end
|
301
|
+
private :visualise_phrase
|
302
|
+
|
303
|
+
def phrase?(input)
|
304
|
+
/[^[:space:]][[:space:]][^[:space:]]/.match?(input)
|
264
305
|
end
|
306
|
+
private :phrase?
|
265
307
|
end
|
266
308
|
|
267
309
|
# vim: syntax=ruby
|
data/lib/text-hyphen.rb
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
# -*- ruby encoding: utf-8 -*-
|
2
|
-
require
|
2
|
+
require "text/hyphen"
|
@@ -1,10 +1,10 @@
|
|
1
|
-
# -*- encoding:
|
1
|
+
# -*- encoding: iso-8859-1 -*-
|
2
2
|
|
3
3
|
module TestTextHyphenData
|
4
4
|
def self.bug_9807_data
|
5
5
|
txt = "Dampfschifffahrtskapitänsmützenhalterhersteller"
|
6
6
|
pts = [5, 11, 17, 19, 21, 25, 28, 31, 34, 37, 40, 44]
|
7
7
|
viz = "Dampf-schiff-fahrts-ka-pi-täns-müt-zen-hal-ter-her-stel-ler"
|
8
|
-
[
|
8
|
+
[txt, pts, viz]
|
9
9
|
end
|
10
10
|
end
|
data/test/data/bug_9807_utf-8.rb
CHANGED
data/test/test_bugs.rb
CHANGED
@@ -1,16 +1,17 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require
|
2
|
+
|
3
|
+
require "test/unit"
|
4
|
+
require "text-hyphen"
|
4
5
|
|
5
6
|
# The behaviour of Text::Hyphen differs based on the version and the
|
6
|
-
# encoding. Ruby 1.8 fails if the input is not
|
7
|
-
# patterns are
|
8
|
-
data_version = if RUBY_VERSION <
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
data_path = File.join(File.dirname(__FILE__),
|
7
|
+
# encoding. Ruby 1.8 fails if the input is not iso-8859-1 and the hyphenation
|
8
|
+
# patterns are iso-8859-1. Ruby 1.9 always expects UTF-8 patterns.
|
9
|
+
data_version = if RUBY_VERSION < "1.9.1"
|
10
|
+
"iso-8859-1"
|
11
|
+
else
|
12
|
+
"utf-8"
|
13
|
+
end
|
14
|
+
data_path = File.join(File.dirname(__FILE__), "data")
|
14
15
|
load File.join(data_path, "bug_9807_#{data_version}.rb")
|
15
16
|
|
16
17
|
class TestTextHyphenBugs < Test::Unit::TestCase
|
@@ -19,17 +20,17 @@ class TestTextHyphenBugs < Test::Unit::TestCase
|
|
19
20
|
# http://rubyforge.org/tracker/index.php?func=detail&aid=28498&group_id=294&atid=1195
|
20
21
|
txt, pts, viz = TestTextHyphenData.bug_9807_data
|
21
22
|
|
22
|
-
de1 = Text::Hyphen.new(:language =>
|
23
|
+
de1 = Text::Hyphen.new(:language => "de")
|
23
24
|
assert_equal pts, de1.hyphenate(txt)
|
24
25
|
assert_equal viz, de1.visualize(txt)
|
25
26
|
|
26
|
-
de2 = Text::Hyphen.new(:language =>
|
27
|
+
de2 = Text::Hyphen.new(:language => "de2")
|
27
28
|
assert_equal pts, de2.hyphenate(txt)
|
28
29
|
assert_equal viz, de2.visualize(txt)
|
29
30
|
end
|
30
31
|
|
31
32
|
def test_rubyforge_28128
|
32
|
-
en_us = Text::Hyphen.new(:language =>
|
33
|
+
en_us = Text::Hyphen.new(:language => "en_us")
|
33
34
|
assert_equal [], en_us.hyphenate("to")
|
34
35
|
assert_equal "to", en_us.visualize("to")
|
35
36
|
end
|