text-hyphen 1.4.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.standard.yml +5 -0
- data/Code-of-Conduct.md +73 -0
- data/Contributing.md +68 -0
- data/History.md +139 -0
- data/Licence.md +159 -0
- data/Manifest.txt +12 -5
- data/README.md +81 -0
- data/Rakefile +68 -19
- data/bin/ruby-hyphen +0 -0
- data/lib/text/hyphen/language/1.8/de1.rb +1307 -571
- data/lib/text/hyphen/language/1.8/en_us.rb +412 -453
- data/lib/text/hyphen/language/1.8/fr.rb +128 -334
- data/lib/text/hyphen/language/1.8/la.rb +1 -0
- data/lib/text/hyphen/language/1.8/lt.rb +115 -0
- data/lib/text/hyphen/language/1.8/pt.rb +2 -1
- data/lib/text/hyphen/language/1.8/sk.rb +277 -0
- data/lib/text/hyphen/language/1.9/ca.rb +2 -1
- data/lib/text/hyphen/language/1.9/cs.rb +2 -1
- data/lib/text/hyphen/language/1.9/da.rb +2 -1
- data/lib/text/hyphen/language/1.9/de1.rb +1382 -646
- data/lib/text/hyphen/language/1.9/de2.rb +110 -109
- data/lib/text/hyphen/language/1.9/en_uk.rb +2 -1
- data/lib/text/hyphen/language/1.9/en_us.rb +412 -454
- data/lib/text/hyphen/language/1.9/es.rb +2 -1
- data/lib/text/hyphen/language/1.9/et.rb +6 -5
- data/lib/text/hyphen/language/1.9/eu.rb +4 -3
- data/lib/text/hyphen/language/1.9/fi.rb +3 -2
- data/lib/text/hyphen/language/1.9/fr.rb +136 -343
- data/lib/text/hyphen/language/1.9/ga.rb +27 -26
- data/lib/text/hyphen/language/1.9/hr.rb +6 -5
- data/lib/text/hyphen/language/1.9/hsb.rb +3 -2
- data/lib/text/hyphen/language/1.9/hu1.rb +3 -2
- data/lib/text/hyphen/language/1.9/hu2.rb +5 -4
- data/lib/text/hyphen/language/1.9/ia.rb +2 -1
- data/lib/text/hyphen/language/1.9/id.rb +8 -7
- data/lib/text/hyphen/language/1.9/is.rb +2 -1
- data/lib/text/hyphen/language/1.9/it.rb +74 -74
- data/lib/text/hyphen/language/1.9/la.rb +54 -53
- data/lib/text/hyphen/language/1.9/lt.rb +116 -0
- data/lib/text/hyphen/language/1.9/mn.rb +7 -6
- data/lib/text/hyphen/language/1.9/nl.rb +2 -1
- data/lib/text/hyphen/language/1.9/no1.rb +3 -2
- data/lib/text/hyphen/language/1.9/no2.rb +3 -2
- data/lib/text/hyphen/language/1.9/pl.rb +2 -1
- data/lib/text/hyphen/language/1.9/pt.rb +3 -2
- data/lib/text/hyphen/language/1.9/ru.rb +2 -1
- data/lib/text/hyphen/language/1.9/sk.rb +280 -0
- data/lib/text/hyphen/language/1.9/sv.rb +4 -3
- data/lib/text/hyphen/language/cs.rb +1 -1
- data/lib/text/hyphen/language/de.rb +2 -1
- data/lib/text/hyphen/language/de1.rb +1 -1
- data/lib/text/hyphen/language/de2.rb +1 -1
- data/lib/text/hyphen/language/en_us.rb +1 -1
- data/lib/text/hyphen/language/eu.rb +1 -1
- data/lib/text/hyphen/language/fr.rb +1 -1
- data/lib/text/hyphen/language/hu.rb +1 -1
- data/lib/text/hyphen/language/hu1.rb +1 -1
- data/lib/text/hyphen/language/hu2.rb +1 -1
- data/lib/text/hyphen/language/is.rb +1 -1
- data/lib/text/hyphen/language/lt.rb +4 -0
- data/lib/text/hyphen/language/ms.rb +3 -3
- data/lib/text/hyphen/language/nl.rb +1 -1
- data/lib/text/hyphen/language/no.rb +1 -1
- data/lib/text/hyphen/language/sk.rb +4 -0
- data/lib/text/hyphen/language.rb +45 -45
- data/lib/text/hyphen.rb +139 -97
- data/lib/text-hyphen.rb +1 -1
- data/test/data/bug_9807_latin1.rb +2 -2
- data/test/data/bug_9807_utf-8.rb +1 -1
- data/test/test_bugs.rb +14 -13
- data/test/test_text_hyphen.rb +31 -21
- metadata +146 -96
- data/.autotest +0 -23
- data/.gemtest +0 -0
- data/History.rdoc +0 -99
- data/License.rdoc +0 -159
- data/README.rdoc +0 -95
- data/text-hyphen.gemspec +0 -51
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
require
|
|
1
|
+
require "text/hyphen/language/id"
|
|
2
2
|
|
|
3
3
|
unless defined? Text::Hyphen::Language::MS
|
|
4
4
|
Text::Hyphen::Language::MS = Text::Hyphen::Language.new(Text::Hyphen::Language::ID) do |malay|
|
|
5
|
-
malay.isocode =
|
|
5
|
+
malay.isocode = "ms"
|
|
6
6
|
end
|
|
7
7
|
|
|
8
|
-
Text::Hyphen::Language.aliases_for "MS" => %W
|
|
8
|
+
Text::Hyphen::Language.aliases_for "MS" => %W[MAY MSA]
|
|
9
9
|
end
|
|
@@ -1 +1 @@
|
|
|
1
|
-
require
|
|
1
|
+
require "text/hyphen/language/no1"
|
data/lib/text/hyphen/language.rb
CHANGED
|
@@ -4,32 +4,32 @@
|
|
|
4
4
|
# patterns are defined as instances of this class—and only this class. This
|
|
5
5
|
# is a deliberate "breaking" of Ruby's concept of duck-typing and is
|
|
6
6
|
# intended to provide an indication that the patterns have been converted
|
|
7
|
-
# from TeX encodings to other encodings (e.g.,
|
|
7
|
+
# from TeX encodings to other encodings (e.g., iso-8859-1 or UTF-8) that are
|
|
8
8
|
# more suitable to general text manipulations.
|
|
9
9
|
class Text::Hyphen::Language
|
|
10
|
-
WORD_START_RE
|
|
11
|
-
WORD_END_RE
|
|
12
|
-
DIGIT_RE
|
|
13
|
-
NONDIGIT_RE
|
|
14
|
-
DASH_RE
|
|
15
|
-
EXCEPTION_DASH0_RE
|
|
16
|
-
EXCEPTION_DASH1_RE
|
|
17
|
-
EXCEPTION_NONUM_RE
|
|
18
|
-
ZERO_INSERT_RE
|
|
19
|
-
ZERO_START_RE
|
|
20
|
-
|
|
21
|
-
DEFAULT_ENCODING
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
10
|
+
WORD_START_RE = %r{^\.} # :nodoc:
|
|
11
|
+
WORD_END_RE = %r{\.$} # :nodoc:
|
|
12
|
+
DIGIT_RE = %r{\d} # :nodoc:
|
|
13
|
+
NONDIGIT_RE = %r{\D} # :nodoc:
|
|
14
|
+
DASH_RE = %r{-} # :nodoc:
|
|
15
|
+
EXCEPTION_DASH0_RE = %r{[^-](?=[^-])} # :nodoc:
|
|
16
|
+
EXCEPTION_DASH1_RE = %r{[^-]-} # :nodoc:
|
|
17
|
+
EXCEPTION_NONUM_RE = %r{[^01]} # :nodoc:
|
|
18
|
+
ZERO_INSERT_RE = %r{(\D)(?=\D)} # :nodoc:
|
|
19
|
+
ZERO_START_RE = %r{^(?=\D)} # :nodoc:
|
|
20
|
+
|
|
21
|
+
DEFAULT_ENCODING = if RUBY_VERSION < "1.9.1" # :nodoc:
|
|
22
|
+
"iso-8859-1"
|
|
23
|
+
else
|
|
24
|
+
"utf-8"
|
|
25
|
+
end
|
|
26
26
|
|
|
27
27
|
# The character scan regular expression to use.
|
|
28
|
-
def scan_re
|
|
29
|
-
if RUBY_VERSION <
|
|
28
|
+
def scan_re # :nodoc:
|
|
29
|
+
if RUBY_VERSION < "1.9.1"
|
|
30
30
|
return %r{.}u if @encoding =~ /utf-?8/i
|
|
31
31
|
end
|
|
32
|
-
|
|
32
|
+
%r{.}
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
# The encoding of the hyphenation definitions. The text to be compared
|
|
@@ -66,21 +66,21 @@ class Text::Hyphen::Language
|
|
|
66
66
|
@pattern_text = pats.dup
|
|
67
67
|
|
|
68
68
|
@patterns = {
|
|
69
|
-
:both
|
|
70
|
-
:start
|
|
71
|
-
:stop
|
|
69
|
+
:both => {},
|
|
70
|
+
:start => {},
|
|
71
|
+
:stop => {},
|
|
72
72
|
:hyphen => {}
|
|
73
73
|
}
|
|
74
74
|
|
|
75
|
-
plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$},
|
|
75
|
+
plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$}, "") }
|
|
76
76
|
plist.each do |line|
|
|
77
77
|
line.split.each do |word|
|
|
78
78
|
next if word.empty?
|
|
79
79
|
|
|
80
80
|
start = stop = false
|
|
81
81
|
|
|
82
|
-
start = true if word.sub!(WORD_START_RE,
|
|
83
|
-
stop
|
|
82
|
+
start = true if word.sub!(WORD_START_RE, "")
|
|
83
|
+
stop = true if word.sub!(WORD_END_RE, "")
|
|
84
84
|
|
|
85
85
|
# Insert zeroes and start with some digit
|
|
86
86
|
word.gsub!(ZERO_INSERT_RE) { "#{$1}0" }
|
|
@@ -88,17 +88,17 @@ class Text::Hyphen::Language
|
|
|
88
88
|
|
|
89
89
|
# This assumes that the pattern lists are already in lowercase
|
|
90
90
|
# form only.
|
|
91
|
-
tag
|
|
92
|
-
value = word.gsub(NONDIGIT_RE,
|
|
91
|
+
tag = word.gsub(DIGIT_RE, "")
|
|
92
|
+
value = word.gsub(NONDIGIT_RE, "")
|
|
93
93
|
|
|
94
|
-
if start
|
|
95
|
-
|
|
94
|
+
set = if start && stop
|
|
95
|
+
:both
|
|
96
96
|
elsif start
|
|
97
|
-
|
|
97
|
+
:start
|
|
98
98
|
elsif stop
|
|
99
|
-
|
|
99
|
+
:stop
|
|
100
100
|
else
|
|
101
|
-
|
|
101
|
+
:hyphen
|
|
102
102
|
end
|
|
103
103
|
|
|
104
104
|
@patterns[set][tag] = value
|
|
@@ -116,10 +116,10 @@ class Text::Hyphen::Language
|
|
|
116
116
|
@exceptions = {}
|
|
117
117
|
|
|
118
118
|
@exception_text.split.each do |word|
|
|
119
|
-
tag
|
|
120
|
-
value = "0" + word.gsub(EXCEPTION_DASH0_RE,
|
|
121
|
-
value.gsub!(EXCEPTION_NONUM_RE,
|
|
122
|
-
@exceptions[tag] = value.scan(
|
|
119
|
+
tag = word.gsub(DASH_RE, "")
|
|
120
|
+
value = "0" + word.gsub(EXCEPTION_DASH0_RE, "0").gsub(EXCEPTION_DASH1_RE, "1")
|
|
121
|
+
value.gsub!(EXCEPTION_NONUM_RE, "0")
|
|
122
|
+
@exceptions[tag] = value.scan(scan_re).map { |c| c.to_i }
|
|
123
123
|
end
|
|
124
124
|
|
|
125
125
|
true
|
|
@@ -142,16 +142,16 @@ class Text::Hyphen::Language
|
|
|
142
142
|
# instance of Text::Hyphen::Language.
|
|
143
143
|
def initialize(language = nil)
|
|
144
144
|
if language.nil?
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
145
|
+
encoding DEFAULT_ENCODING
|
|
146
|
+
patterns ""
|
|
147
|
+
exceptions ""
|
|
148
148
|
self.left = 2
|
|
149
149
|
self.right = 2
|
|
150
150
|
self.isocode = nil
|
|
151
|
-
elsif language.
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
151
|
+
elsif language.is_a? Text::Hyphen::Language
|
|
152
|
+
encoding language.encoding
|
|
153
|
+
patterns language.instance_variable_get(:@pattern_text)
|
|
154
|
+
exceptions language.instance_variable_get(:@exception_text)
|
|
155
155
|
self.left = language.left
|
|
156
156
|
self.right = language.right
|
|
157
157
|
self.isocode = language.isocode
|
|
@@ -171,7 +171,7 @@ class Text::Hyphen::Language
|
|
|
171
171
|
end
|
|
172
172
|
language = const_get(language)
|
|
173
173
|
|
|
174
|
-
[
|
|
174
|
+
[alias_names].flatten.each do |alias_name|
|
|
175
175
|
next if const_defined? alias_name
|
|
176
176
|
const_set(alias_name, language)
|
|
177
177
|
end
|
data/lib/text/hyphen.rb
CHANGED
|
@@ -7,10 +7,21 @@ end
|
|
|
7
7
|
# hyphenation algorithm with pattern files. Each object is constructed with
|
|
8
8
|
# a specific language's hyphenation patterns.
|
|
9
9
|
class Text::Hyphen
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
# Resolves a file for cleaner loading from a hyphenation loader file.
|
|
11
|
+
def self.require_real_hyphenation_file(loader) # :nodoc:
|
|
12
|
+
p = File.dirname(loader)
|
|
13
|
+
f = File.basename(loader)
|
|
14
|
+
v = if RUBY_VERSION < "1.9.1"
|
|
15
|
+
"1.8"
|
|
16
|
+
else
|
|
17
|
+
"1.9"
|
|
18
|
+
end
|
|
19
|
+
require File.join(p, v, f)
|
|
20
|
+
end
|
|
12
21
|
|
|
13
|
-
|
|
22
|
+
VERSION = "1.5.0"
|
|
23
|
+
|
|
24
|
+
DEFAULT_MIN_LEFT = 2
|
|
14
25
|
DEFAULT_MIN_RIGHT = 2
|
|
15
26
|
|
|
16
27
|
# No fewer than this number of letters will show up to the left of the
|
|
@@ -26,31 +37,31 @@ class Text::Hyphen
|
|
|
26
37
|
# two or three character ISO 639 code, with the two character form being
|
|
27
38
|
# the canonical resource name. This will load the language hyphenation
|
|
28
39
|
# definitions from text/hyphen/language/<code> as a Ruby class. The
|
|
29
|
-
# resource
|
|
40
|
+
# resource "text/hyphen/language/en_us" defines the language class
|
|
30
41
|
# Text::Hyphen::Language::EN_US. It also defines the secondary forms
|
|
31
42
|
# Text::Hyphen::Language::EN and Text::Hyphen::Language::ENG_US.
|
|
32
43
|
#
|
|
33
44
|
# Minimal transformations will be performed on the language code provided,
|
|
34
|
-
# such that any dashes are converted to underscores (e.g.,
|
|
35
|
-
#
|
|
36
|
-
# downcased and class names will be converted to uppercase (e.g.,
|
|
37
|
-
# the Portuguese language becomes
|
|
45
|
+
# such that any dashes are converted to underscores (e.g., "en-us" becomes
|
|
46
|
+
# "en_us") and all characters are regularised. Resource names will be
|
|
47
|
+
# downcased and class names will be converted to uppercase (e.g., "Pt" for
|
|
48
|
+
# the Portuguese language becomes "pt" and "PT", respectively).
|
|
38
49
|
#
|
|
39
50
|
# The language may also be specified as an instance of
|
|
40
51
|
# Text::Hyphen::Language.
|
|
41
|
-
|
|
52
|
+
#
|
|
53
|
+
# :attr_accessor: language
|
|
54
|
+
attr_reader :language
|
|
42
55
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
if lang.kind_of? Text::Hyphen::Language
|
|
56
|
+
def language=(lang) # :nodoc:
|
|
57
|
+
require "text/hyphen/language" unless defined?(Text::Hyphen::Language)
|
|
58
|
+
if lang.is_a? Text::Hyphen::Language
|
|
47
59
|
@iso_language = lang.to_s.split(%r{::}o)[-1].downcase
|
|
48
|
-
@language
|
|
60
|
+
@language = lang
|
|
49
61
|
else
|
|
50
62
|
@iso_language = lang.downcase
|
|
51
63
|
load_language
|
|
52
64
|
end
|
|
53
|
-
@iso_language
|
|
54
65
|
end
|
|
55
66
|
|
|
56
67
|
# Returns the language's ISO 639 ID, e.g., "en_us" or "pt".
|
|
@@ -70,23 +81,22 @@ class Text::Hyphen
|
|
|
70
81
|
# methods in an initialization block. The following initializations are
|
|
71
82
|
# all equivalent:
|
|
72
83
|
#
|
|
73
|
-
# hyp = Text::Hyphenate.new(:
|
|
74
|
-
# hyp = Text::Hyphenate.new
|
|
75
|
-
# hyp = Text::Hyphenate.new { |h| h.language = 'en_us' }
|
|
84
|
+
# hyp = Text::Hyphen.new(language: "en_us")
|
|
85
|
+
# hyp = Text::Hyphen.new { |h| h.language = "en_us" }
|
|
76
86
|
def initialize(options = {}) # :yields self:
|
|
77
87
|
@iso_language = options[:language]
|
|
78
|
-
@left
|
|
79
|
-
@right
|
|
80
|
-
@language
|
|
88
|
+
@left = options[:left]
|
|
89
|
+
@right = options[:right]
|
|
90
|
+
@language = nil
|
|
81
91
|
|
|
82
|
-
@cache
|
|
83
|
-
@vcache
|
|
92
|
+
@cache = {}
|
|
93
|
+
@vcache = {}
|
|
84
94
|
|
|
85
|
-
@hyphen
|
|
95
|
+
@hyphen = {}
|
|
86
96
|
@begin_hyphen = {}
|
|
87
|
-
@end_hyphen
|
|
88
|
-
@both_hyphen
|
|
89
|
-
@exception
|
|
97
|
+
@end_hyphen = {}
|
|
98
|
+
@both_hyphen = {}
|
|
99
|
+
@exception = {}
|
|
90
100
|
|
|
91
101
|
@first_load = true
|
|
92
102
|
yield self if block_given?
|
|
@@ -94,57 +104,87 @@ class Text::Hyphen
|
|
|
94
104
|
|
|
95
105
|
load_language
|
|
96
106
|
|
|
97
|
-
@left
|
|
107
|
+
@left ||= DEFAULT_MIN_LEFT
|
|
98
108
|
@right ||= DEFAULT_MIN_RIGHT
|
|
99
109
|
end
|
|
100
110
|
|
|
101
111
|
# Returns an array of character positions where a word can be hyphenated.
|
|
102
112
|
#
|
|
103
|
-
# hyp.hyphenate(
|
|
113
|
+
# hyp.hyphenate("representation") #=> [3, 5, 8, 10]
|
|
104
114
|
#
|
|
105
115
|
# Because hyphenation can be expensive, if the word has been hyphenated
|
|
106
116
|
# previously, it will be returned from a per-instance cache.
|
|
117
|
+
#
|
|
118
|
+
# #hyphenate supports phrase hyphenation:
|
|
119
|
+
#
|
|
120
|
+
# hyp.hyphenate("This useful library supports phrases and sentences.")
|
|
121
|
+
# #=> [8, 14, 23, 27, 34, 44]
|
|
122
|
+
#
|
|
123
|
+
# When phrases are hyphenated, each word is processed individually and the
|
|
124
|
+
# result is returned as a single continuous list of hyphenation points.
|
|
107
125
|
def hyphenate(word)
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
126
|
+
words = if phrase?(word)
|
|
127
|
+
word.downcase.split(/[[:space:]]/)
|
|
128
|
+
else
|
|
129
|
+
[word.downcase]
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
points = words.map do |word|
|
|
133
|
+
next @cache[word] if @cache.has_key?(word)
|
|
134
|
+
|
|
135
|
+
if (exception = @language.exceptions[word])
|
|
136
|
+
next @cache[word] = make_result_list(exception)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
letters = word.scan(@language.scan_re)
|
|
140
|
+
word_size = letters.size
|
|
141
|
+
|
|
142
|
+
result = [0] * (word_size + 1)
|
|
143
|
+
right_stop = word_size - @right
|
|
144
|
+
|
|
145
|
+
updater = proc do |hash, str, pos|
|
|
146
|
+
if hash.has_key?(str)
|
|
147
|
+
hash[str].scan(@language.scan_re).each_with_index do |cc, ii|
|
|
148
|
+
cc = cc.to_i
|
|
149
|
+
result[ii + pos] = cc if cc > result[ii + pos]
|
|
150
|
+
end
|
|
127
151
|
end
|
|
128
|
-
$stderr.print ": #{result.inspect}\n" if DEBUG
|
|
129
152
|
end
|
|
130
|
-
end
|
|
131
153
|
|
|
132
154
|
# Walk the word
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
155
|
+
(0..right_stop).each do |pos|
|
|
156
|
+
rest_length = word_size - pos
|
|
157
|
+
(1..rest_length).each do |length|
|
|
158
|
+
substr = letters[pos, length].join("")
|
|
159
|
+
updater[@language.hyphen, substr, pos]
|
|
160
|
+
updater[@language.start, substr, pos] if pos.zero?
|
|
161
|
+
updater[@language.stop, substr, pos] if length == rest_length
|
|
162
|
+
end
|
|
140
163
|
end
|
|
164
|
+
|
|
165
|
+
updater[@language.both, word, 0] if @language.both[word]
|
|
166
|
+
|
|
167
|
+
(0..@left).each { |i| result[i] = 0 }
|
|
168
|
+
((-1 - @right)..-1).each { |i| result[i] = 0 }
|
|
169
|
+
@cache[word] = make_result_list(result)
|
|
141
170
|
end
|
|
142
171
|
|
|
143
|
-
|
|
172
|
+
if points.length > 1
|
|
173
|
+
offset = 0
|
|
174
|
+
result = []
|
|
144
175
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
176
|
+
points.each_with_index do |word, i|
|
|
177
|
+
word.each do |pos|
|
|
178
|
+
result << pos + offset
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
offset += words[i].length + 1
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
result
|
|
185
|
+
else
|
|
186
|
+
points.flatten
|
|
187
|
+
end
|
|
148
188
|
end
|
|
149
189
|
|
|
150
190
|
# Returns a visualization of the hyphenation points.
|
|
@@ -157,8 +197,15 @@ class Text::Hyphen
|
|
|
157
197
|
#
|
|
158
198
|
# Because hyphenation can be expensive, if the word has been visualised
|
|
159
199
|
# previously, it will be returned from a per-instance cache.
|
|
160
|
-
|
|
200
|
+
#
|
|
201
|
+
# #visualise supports phrase hyphenation:
|
|
202
|
+
#
|
|
203
|
+
# hyp.visualise("This useful library supports phrases and sentences.")
|
|
204
|
+
# #=> This use-ful li-brary sup-port-s phras-es and sen-tences.
|
|
205
|
+
def visualise(word, hyphen = "-")
|
|
206
|
+
return visualise_phrase(word, hyphen) if phrase?(word)
|
|
161
207
|
return @vcache[word] if @vcache.has_key?(word)
|
|
208
|
+
|
|
162
209
|
w = word.dup
|
|
163
210
|
s = hyphen.size
|
|
164
211
|
hyphenate(w).each_with_index do |pos, n|
|
|
@@ -168,7 +215,7 @@ class Text::Hyphen
|
|
|
168
215
|
end
|
|
169
216
|
@vcache[word] = w
|
|
170
217
|
end
|
|
171
|
-
|
|
218
|
+
alias_method :visualize, :visualise
|
|
172
219
|
|
|
173
220
|
# Clears the per-instance hyphenation and visualization caches.
|
|
174
221
|
def clear_cache!
|
|
@@ -177,29 +224,33 @@ class Text::Hyphen
|
|
|
177
224
|
end
|
|
178
225
|
|
|
179
226
|
# This function will hyphenate a word so that the first point is at most
|
|
227
|
+
# +size+ characters.
|
|
180
228
|
#
|
|
181
229
|
# NOTE: if hyphen is set to a string, it will still be counted as one
|
|
182
230
|
# character (since it represents a hyphen)
|
|
183
231
|
#
|
|
184
|
-
#
|
|
185
|
-
|
|
232
|
+
# #hyphenate_to does not support phrase hyphenation and will throw an
|
|
233
|
+
# exception if there are spaces.
|
|
234
|
+
def hyphenate_to(word, size, hyphen = "-")
|
|
235
|
+
raise ArgumentError, "#hyphenate_to does not support phrases" if phrase?(word)
|
|
236
|
+
|
|
186
237
|
point = hyphenate(word).delete_if { |e| e >= size }.max
|
|
187
238
|
if point.nil?
|
|
188
239
|
[nil, word]
|
|
189
240
|
else
|
|
190
|
-
[word[0
|
|
241
|
+
[word[0...point] + hyphen, word[point..-1]]
|
|
191
242
|
end
|
|
192
243
|
end
|
|
193
244
|
|
|
194
245
|
# Returns a string describing the structure of the patterns for the
|
|
195
246
|
# language of this hyphenation object.
|
|
196
247
|
def stats
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
248
|
+
stats_both = @language.both.size
|
|
249
|
+
stats_start = @language.start.size
|
|
250
|
+
stats_end = @language.stop.size
|
|
251
|
+
stats_hyphens = @language.hyphen.size
|
|
252
|
+
stats_exceptions = @language.exceptions.size
|
|
253
|
+
stats_total = stats_both + stats_start + stats_end + stats_hyphens + stats_exceptions
|
|
203
254
|
|
|
204
255
|
s = <<-EOS
|
|
205
256
|
|
|
@@ -210,25 +261,13 @@ The language '%s' contains %d total hyphenation patterns.
|
|
|
210
261
|
% 6d patterns are normal patterns.
|
|
211
262
|
% 6d patterns are exceptions.
|
|
212
263
|
|
|
213
|
-
EOS
|
|
214
|
-
s % [
|
|
264
|
+
EOS
|
|
265
|
+
s % [@iso_language, stats_total, stats_start, stats_end, stats_both, stats_hyphens, stats_exceptions]
|
|
215
266
|
end
|
|
216
267
|
|
|
217
|
-
def updateresult(hash, str, pos)
|
|
218
|
-
if hash.has_key?(str)
|
|
219
|
-
STDERR.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
|
|
220
|
-
hash[str].scan(@language.scan_re).each_with_index do |c, i|
|
|
221
|
-
c = c.to_i
|
|
222
|
-
@result[i + pos] = c if c > @result[i + pos]
|
|
223
|
-
end
|
|
224
|
-
STDERR.puts ": #{@result}" if DEBUG
|
|
225
|
-
end
|
|
226
|
-
end
|
|
227
|
-
private :updateresult
|
|
228
|
-
|
|
229
268
|
def make_result_list(res)
|
|
230
269
|
r = []
|
|
231
|
-
res.each_with_index { |c, i| r <<
|
|
270
|
+
res.each_with_index { |c, i| r << i * (c.to_i % 2) }
|
|
232
271
|
r.reject { |i| i.to_i == 0 }
|
|
233
272
|
end
|
|
234
273
|
private :make_result_list
|
|
@@ -251,17 +290,20 @@ EOS
|
|
|
251
290
|
end
|
|
252
291
|
private :load_language
|
|
253
292
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
293
|
+
def split_phrase(phrase)
|
|
294
|
+
phrase.split(/[[:space:]]+/)
|
|
295
|
+
end
|
|
296
|
+
private :split_phrase
|
|
297
|
+
|
|
298
|
+
def visualise_phrase(phrase, hyphen)
|
|
299
|
+
split_phrase(phrase).map { |word| visualise(word, hyphen) }.join(" ")
|
|
300
|
+
end
|
|
301
|
+
private :visualise_phrase
|
|
302
|
+
|
|
303
|
+
def phrase?(input)
|
|
304
|
+
/[^[:space:]][[:space:]][^[:space:]]/.match?(input)
|
|
264
305
|
end
|
|
306
|
+
private :phrase?
|
|
265
307
|
end
|
|
266
308
|
|
|
267
309
|
# vim: syntax=ruby
|
data/lib/text-hyphen.rb
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
# -*- ruby encoding: utf-8 -*-
|
|
2
|
-
require
|
|
2
|
+
require "text/hyphen"
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
# -*- encoding:
|
|
1
|
+
# -*- encoding: iso-8859-1 -*-
|
|
2
2
|
|
|
3
3
|
module TestTextHyphenData
|
|
4
4
|
def self.bug_9807_data
|
|
5
5
|
txt = "Dampfschifffahrtskapit�nsm�tzenhalterhersteller"
|
|
6
6
|
pts = [5, 11, 17, 19, 21, 25, 28, 31, 34, 37, 40, 44]
|
|
7
7
|
viz = "Dampf-schiff-fahrts-ka-pi-t�ns-m�t-zen-hal-ter-her-stel-ler"
|
|
8
|
-
[
|
|
8
|
+
[txt, pts, viz]
|
|
9
9
|
end
|
|
10
10
|
end
|
data/test/data/bug_9807_utf-8.rb
CHANGED
data/test/test_bugs.rb
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
-
|
|
3
|
-
require
|
|
2
|
+
|
|
3
|
+
require "test/unit"
|
|
4
|
+
require "text-hyphen"
|
|
4
5
|
|
|
5
6
|
# The behaviour of Text::Hyphen differs based on the version and the
|
|
6
|
-
# encoding. Ruby 1.8 fails if the input is not
|
|
7
|
-
# patterns are
|
|
8
|
-
data_version = if RUBY_VERSION <
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
data_path = File.join(File.dirname(__FILE__),
|
|
7
|
+
# encoding. Ruby 1.8 fails if the input is not iso-8859-1 and the hyphenation
|
|
8
|
+
# patterns are iso-8859-1. Ruby 1.9 always expects UTF-8 patterns.
|
|
9
|
+
data_version = if RUBY_VERSION < "1.9.1"
|
|
10
|
+
"iso-8859-1"
|
|
11
|
+
else
|
|
12
|
+
"utf-8"
|
|
13
|
+
end
|
|
14
|
+
data_path = File.join(File.dirname(__FILE__), "data")
|
|
14
15
|
load File.join(data_path, "bug_9807_#{data_version}.rb")
|
|
15
16
|
|
|
16
17
|
class TestTextHyphenBugs < Test::Unit::TestCase
|
|
@@ -19,17 +20,17 @@ class TestTextHyphenBugs < Test::Unit::TestCase
|
|
|
19
20
|
# http://rubyforge.org/tracker/index.php?func=detail&aid=28498&group_id=294&atid=1195
|
|
20
21
|
txt, pts, viz = TestTextHyphenData.bug_9807_data
|
|
21
22
|
|
|
22
|
-
de1 = Text::Hyphen.new(:language =>
|
|
23
|
+
de1 = Text::Hyphen.new(:language => "de")
|
|
23
24
|
assert_equal pts, de1.hyphenate(txt)
|
|
24
25
|
assert_equal viz, de1.visualize(txt)
|
|
25
26
|
|
|
26
|
-
de2 = Text::Hyphen.new(:language =>
|
|
27
|
+
de2 = Text::Hyphen.new(:language => "de2")
|
|
27
28
|
assert_equal pts, de2.hyphenate(txt)
|
|
28
29
|
assert_equal viz, de2.visualize(txt)
|
|
29
30
|
end
|
|
30
31
|
|
|
31
32
|
def test_rubyforge_28128
|
|
32
|
-
en_us = Text::Hyphen.new(:language =>
|
|
33
|
+
en_us = Text::Hyphen.new(:language => "en_us")
|
|
33
34
|
assert_equal [], en_us.hyphenate("to")
|
|
34
35
|
assert_equal "to", en_us.visualize("to")
|
|
35
36
|
end
|