text-hyphen 1.4.1 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.standard.yml +5 -0
  3. data/Code-of-Conduct.md +73 -0
  4. data/Contributing.md +68 -0
  5. data/History.md +139 -0
  6. data/Licence.md +159 -0
  7. data/Manifest.txt +12 -5
  8. data/README.md +81 -0
  9. data/Rakefile +68 -19
  10. data/bin/ruby-hyphen +0 -0
  11. data/lib/text/hyphen/language/1.8/de1.rb +1307 -571
  12. data/lib/text/hyphen/language/1.8/en_us.rb +412 -453
  13. data/lib/text/hyphen/language/1.8/fr.rb +128 -334
  14. data/lib/text/hyphen/language/1.8/la.rb +1 -0
  15. data/lib/text/hyphen/language/1.8/lt.rb +115 -0
  16. data/lib/text/hyphen/language/1.8/pt.rb +2 -1
  17. data/lib/text/hyphen/language/1.8/sk.rb +277 -0
  18. data/lib/text/hyphen/language/1.9/ca.rb +2 -1
  19. data/lib/text/hyphen/language/1.9/cs.rb +2 -1
  20. data/lib/text/hyphen/language/1.9/da.rb +2 -1
  21. data/lib/text/hyphen/language/1.9/de1.rb +1382 -646
  22. data/lib/text/hyphen/language/1.9/de2.rb +110 -109
  23. data/lib/text/hyphen/language/1.9/en_uk.rb +2 -1
  24. data/lib/text/hyphen/language/1.9/en_us.rb +412 -454
  25. data/lib/text/hyphen/language/1.9/es.rb +2 -1
  26. data/lib/text/hyphen/language/1.9/et.rb +6 -5
  27. data/lib/text/hyphen/language/1.9/eu.rb +4 -3
  28. data/lib/text/hyphen/language/1.9/fi.rb +3 -2
  29. data/lib/text/hyphen/language/1.9/fr.rb +136 -343
  30. data/lib/text/hyphen/language/1.9/ga.rb +27 -26
  31. data/lib/text/hyphen/language/1.9/hr.rb +6 -5
  32. data/lib/text/hyphen/language/1.9/hsb.rb +3 -2
  33. data/lib/text/hyphen/language/1.9/hu1.rb +3 -2
  34. data/lib/text/hyphen/language/1.9/hu2.rb +5 -4
  35. data/lib/text/hyphen/language/1.9/ia.rb +2 -1
  36. data/lib/text/hyphen/language/1.9/id.rb +8 -7
  37. data/lib/text/hyphen/language/1.9/is.rb +2 -1
  38. data/lib/text/hyphen/language/1.9/it.rb +74 -74
  39. data/lib/text/hyphen/language/1.9/la.rb +54 -53
  40. data/lib/text/hyphen/language/1.9/lt.rb +116 -0
  41. data/lib/text/hyphen/language/1.9/mn.rb +7 -6
  42. data/lib/text/hyphen/language/1.9/nl.rb +2 -1
  43. data/lib/text/hyphen/language/1.9/no1.rb +3 -2
  44. data/lib/text/hyphen/language/1.9/no2.rb +3 -2
  45. data/lib/text/hyphen/language/1.9/pl.rb +2 -1
  46. data/lib/text/hyphen/language/1.9/pt.rb +3 -2
  47. data/lib/text/hyphen/language/1.9/ru.rb +2 -1
  48. data/lib/text/hyphen/language/1.9/sk.rb +280 -0
  49. data/lib/text/hyphen/language/1.9/sv.rb +4 -3
  50. data/lib/text/hyphen/language/cs.rb +1 -1
  51. data/lib/text/hyphen/language/de.rb +2 -1
  52. data/lib/text/hyphen/language/de1.rb +1 -1
  53. data/lib/text/hyphen/language/de2.rb +1 -1
  54. data/lib/text/hyphen/language/en_us.rb +1 -1
  55. data/lib/text/hyphen/language/eu.rb +1 -1
  56. data/lib/text/hyphen/language/fr.rb +1 -1
  57. data/lib/text/hyphen/language/hu.rb +1 -1
  58. data/lib/text/hyphen/language/hu1.rb +1 -1
  59. data/lib/text/hyphen/language/hu2.rb +1 -1
  60. data/lib/text/hyphen/language/is.rb +1 -1
  61. data/lib/text/hyphen/language/lt.rb +4 -0
  62. data/lib/text/hyphen/language/ms.rb +3 -3
  63. data/lib/text/hyphen/language/nl.rb +1 -1
  64. data/lib/text/hyphen/language/no.rb +1 -1
  65. data/lib/text/hyphen/language/sk.rb +4 -0
  66. data/lib/text/hyphen/language.rb +45 -45
  67. data/lib/text/hyphen.rb +139 -97
  68. data/lib/text-hyphen.rb +1 -1
  69. data/test/data/bug_9807_latin1.rb +2 -2
  70. data/test/data/bug_9807_utf-8.rb +1 -1
  71. data/test/test_bugs.rb +14 -13
  72. data/test/test_text_hyphen.rb +31 -21
  73. metadata +146 -96
  74. data/.autotest +0 -23
  75. data/.gemtest +0 -0
  76. data/History.rdoc +0 -99
  77. data/License.rdoc +0 -159
  78. data/README.rdoc +0 -95
  79. data/text-hyphen.gemspec +0 -51
@@ -1,4 +1,4 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
- Text::Hyphen::Language.aliases_for "HU2" => %W(HUN HU)
4
+ Text::Hyphen::Language.aliases_for "HU2" => %W[HUN HU]
@@ -1,4 +1,4 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
- Text::Hyphen::Language.aliases_for "IS" => %W(ICE ISL)
4
+ Text::Hyphen::Language.aliases_for "IS" => %W[ICE ISL]
@@ -0,0 +1,4 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
+ Text::Hyphen::Language.aliases_for "LT" => "LTU"
@@ -1,9 +1,9 @@
1
- require 'text/hyphen/language/id'
1
+ require "text/hyphen/language/id"
2
2
 
3
3
  unless defined? Text::Hyphen::Language::MS
4
4
  Text::Hyphen::Language::MS = Text::Hyphen::Language.new(Text::Hyphen::Language::ID) do |malay|
5
- malay.isocode = 'ms'
5
+ malay.isocode = "ms"
6
6
  end
7
7
 
8
- Text::Hyphen::Language.aliases_for "MS" => %W(MAY MSA)
8
+ Text::Hyphen::Language.aliases_for "MS" => %W[MAY MSA]
9
9
  end
@@ -1,4 +1,4 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
- Text::Hyphen::Language.aliases_for "NL" => %W(DUT NLD)
4
+ Text::Hyphen::Language.aliases_for "NL" => %W[DUT NLD]
@@ -1 +1 @@
1
- require 'text/hyphen/language/no1'
1
+ require "text/hyphen/language/no1"
@@ -0,0 +1,4 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
+ Text::Hyphen::Language.aliases_for "SK" => "SVK"
@@ -4,32 +4,32 @@
4
4
  # patterns are defined as instances of this class—and only this class. This
5
5
  # is a deliberate "breaking" of Ruby's concept of duck-typing and is
6
6
  # intended to provide an indication that the patterns have been converted
7
- # from TeX encodings to other encodings (e.g., latin1 or UTF-8) that are
7
+ # from TeX encodings to other encodings (e.g., iso-8859-1 or UTF-8) that are
8
8
  # more suitable to general text manipulations.
9
9
  class Text::Hyphen::Language
10
- WORD_START_RE = %r{^\.} #:nodoc:
11
- WORD_END_RE = %r{\.$} #:nodoc:
12
- DIGIT_RE = %r{\d} #:nodoc:
13
- NONDIGIT_RE = %r{\D} #:nodoc:
14
- DASH_RE = %r{-} #:nodoc:
15
- EXCEPTION_DASH0_RE = %r{[^-](?=[^-])} #:nodoc:
16
- EXCEPTION_DASH1_RE = %r{[^-]-} #:nodoc:
17
- EXCEPTION_NONUM_RE = %r{[^01]} #:nodoc:
18
- ZERO_INSERT_RE = %r{(\D)(?=\D)} #:nodoc:
19
- ZERO_START_RE = %r{^(?=\D)} #:nodoc:
20
-
21
- DEFAULT_ENCODING = if RUBY_VERSION < "1.9.1" #:nodoc:
22
- "latin1"
23
- else
24
- "utf-8"
25
- end
10
+ WORD_START_RE = %r{^\.} # :nodoc:
11
+ WORD_END_RE = %r{\.$} # :nodoc:
12
+ DIGIT_RE = %r{\d} # :nodoc:
13
+ NONDIGIT_RE = %r{\D} # :nodoc:
14
+ DASH_RE = %r{-} # :nodoc:
15
+ EXCEPTION_DASH0_RE = %r{[^-](?=[^-])} # :nodoc:
16
+ EXCEPTION_DASH1_RE = %r{[^-]-} # :nodoc:
17
+ EXCEPTION_NONUM_RE = %r{[^01]} # :nodoc:
18
+ ZERO_INSERT_RE = %r{(\D)(?=\D)} # :nodoc:
19
+ ZERO_START_RE = %r{^(?=\D)} # :nodoc:
20
+
21
+ DEFAULT_ENCODING = if RUBY_VERSION < "1.9.1" # :nodoc:
22
+ "iso-8859-1"
23
+ else
24
+ "utf-8"
25
+ end
26
26
 
27
27
  # The character scan regular expression to use.
28
- def scan_re #:nodoc:
29
- if RUBY_VERSION < '1.9.1'
28
+ def scan_re # :nodoc:
29
+ if RUBY_VERSION < "1.9.1"
30
30
  return %r{.}u if @encoding =~ /utf-?8/i
31
31
  end
32
- return %r{.}
32
+ %r{.}
33
33
  end
34
34
 
35
35
  # The encoding of the hyphenation definitions. The text to be compared
@@ -66,21 +66,21 @@ class Text::Hyphen::Language
66
66
  @pattern_text = pats.dup
67
67
 
68
68
  @patterns = {
69
- :both => {},
70
- :start => {},
71
- :stop => {},
69
+ :both => {},
70
+ :start => {},
71
+ :stop => {},
72
72
  :hyphen => {}
73
73
  }
74
74
 
75
- plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$}, '') }
75
+ plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$}, "") }
76
76
  plist.each do |line|
77
77
  line.split.each do |word|
78
78
  next if word.empty?
79
79
 
80
80
  start = stop = false
81
81
 
82
- start = true if word.sub!(WORD_START_RE, '')
83
- stop = true if word.sub!(WORD_END_RE, '')
82
+ start = true if word.sub!(WORD_START_RE, "")
83
+ stop = true if word.sub!(WORD_END_RE, "")
84
84
 
85
85
  # Insert zeroes and start with some digit
86
86
  word.gsub!(ZERO_INSERT_RE) { "#{$1}0" }
@@ -88,17 +88,17 @@ class Text::Hyphen::Language
88
88
 
89
89
  # This assumes that the pattern lists are already in lowercase
90
90
  # form only.
91
- tag = word.gsub(DIGIT_RE, '')
92
- value = word.gsub(NONDIGIT_RE, '')
91
+ tag = word.gsub(DIGIT_RE, "")
92
+ value = word.gsub(NONDIGIT_RE, "")
93
93
 
94
- if start and stop
95
- set = :both
94
+ set = if start && stop
95
+ :both
96
96
  elsif start
97
- set = :start
97
+ :start
98
98
  elsif stop
99
- set = :stop
99
+ :stop
100
100
  else
101
- set = :hyphen
101
+ :hyphen
102
102
  end
103
103
 
104
104
  @patterns[set][tag] = value
@@ -116,10 +116,10 @@ class Text::Hyphen::Language
116
116
  @exceptions = {}
117
117
 
118
118
  @exception_text.split.each do |word|
119
- tag = word.gsub(DASH_RE,'')
120
- value = "0" + word.gsub(EXCEPTION_DASH0_RE, '0').gsub(EXCEPTION_DASH1_RE, '1')
121
- value.gsub!(EXCEPTION_NONUM_RE, '0')
122
- @exceptions[tag] = value.scan(self.scan_re).map { |c| c.to_i }
119
+ tag = word.gsub(DASH_RE, "")
120
+ value = "0" + word.gsub(EXCEPTION_DASH0_RE, "0").gsub(EXCEPTION_DASH1_RE, "1")
121
+ value.gsub!(EXCEPTION_NONUM_RE, "0")
122
+ @exceptions[tag] = value.scan(scan_re).map { |c| c.to_i }
123
123
  end
124
124
 
125
125
  true
@@ -142,16 +142,16 @@ class Text::Hyphen::Language
142
142
  # instance of Text::Hyphen::Language.
143
143
  def initialize(language = nil)
144
144
  if language.nil?
145
- self.encoding DEFAULT_ENCODING
146
- self.patterns ""
147
- self.exceptions ""
145
+ encoding DEFAULT_ENCODING
146
+ patterns ""
147
+ exceptions ""
148
148
  self.left = 2
149
149
  self.right = 2
150
150
  self.isocode = nil
151
- elsif language.kind_of? Text::Hyphen::Language
152
- self.encoding language.encoding
153
- self.patterns language.instance_variable_get(:@pattern_text)
154
- self.exceptions language.instance_variable_get(:@exception_text)
151
+ elsif language.is_a? Text::Hyphen::Language
152
+ encoding language.encoding
153
+ patterns language.instance_variable_get(:@pattern_text)
154
+ exceptions language.instance_variable_get(:@exception_text)
155
155
  self.left = language.left
156
156
  self.right = language.right
157
157
  self.isocode = language.isocode
@@ -171,7 +171,7 @@ class Text::Hyphen::Language
171
171
  end
172
172
  language = const_get(language)
173
173
 
174
- [ alias_names ].flatten.each do |alias_name|
174
+ [alias_names].flatten.each do |alias_name|
175
175
  next if const_defined? alias_name
176
176
  const_set(alias_name, language)
177
177
  end
data/lib/text/hyphen.rb CHANGED
@@ -7,10 +7,21 @@ end
7
7
  # hyphenation algorithm with pattern files. Each object is constructed with
8
8
  # a specific language's hyphenation patterns.
9
9
  class Text::Hyphen
10
- DEBUG = false
11
- VERSION = '1.4.1'
10
+ # Resolves a file for cleaner loading from a hyphenation loader file.
11
+ def self.require_real_hyphenation_file(loader) # :nodoc:
12
+ p = File.dirname(loader)
13
+ f = File.basename(loader)
14
+ v = if RUBY_VERSION < "1.9.1"
15
+ "1.8"
16
+ else
17
+ "1.9"
18
+ end
19
+ require File.join(p, v, f)
20
+ end
12
21
 
13
- DEFAULT_MIN_LEFT = 2
22
+ VERSION = "1.5.0"
23
+
24
+ DEFAULT_MIN_LEFT = 2
14
25
  DEFAULT_MIN_RIGHT = 2
15
26
 
16
27
  # No fewer than this number of letters will show up to the left of the
@@ -26,31 +37,31 @@ class Text::Hyphen
26
37
  # two or three character ISO 639 code, with the two character form being
27
38
  # the canonical resource name. This will load the language hyphenation
28
39
  # definitions from text/hyphen/language/&lt;code&gt; as a Ruby class. The
29
- # resource 'text/hyphen/language/en_us' defines the language class
40
+ # resource "text/hyphen/language/en_us" defines the language class
30
41
  # Text::Hyphen::Language::EN_US. It also defines the secondary forms
31
42
  # Text::Hyphen::Language::EN and Text::Hyphen::Language::ENG_US.
32
43
  #
33
44
  # Minimal transformations will be performed on the language code provided,
34
- # such that any dashes are converted to underscores (e.g., 'en-us' becomes
35
- # 'en_us') and all characters are regularised. Resource names will be
36
- # downcased and class names will be converted to uppercase (e.g., 'Pt' for
37
- # the Portuguese language becomes 'pt' and 'PT', respectively).
45
+ # such that any dashes are converted to underscores (e.g., "en-us" becomes
46
+ # "en_us") and all characters are regularised. Resource names will be
47
+ # downcased and class names will be converted to uppercase (e.g., "Pt" for
48
+ # the Portuguese language becomes "pt" and "PT", respectively).
38
49
  #
39
50
  # The language may also be specified as an instance of
40
51
  # Text::Hyphen::Language.
41
- attr_accessor :language
52
+ #
53
+ # :attr_accessor: language
54
+ attr_reader :language
42
55
 
43
- undef :language=
44
- def language=(lang) #:nodoc:
45
- require 'text/hyphen/language' unless defined?(Text::Hyphen::Language)
46
- if lang.kind_of? Text::Hyphen::Language
56
+ def language=(lang) # :nodoc:
57
+ require "text/hyphen/language" unless defined?(Text::Hyphen::Language)
58
+ if lang.is_a? Text::Hyphen::Language
47
59
  @iso_language = lang.to_s.split(%r{::}o)[-1].downcase
48
- @language = lang
60
+ @language = lang
49
61
  else
50
62
  @iso_language = lang.downcase
51
63
  load_language
52
64
  end
53
- @iso_language
54
65
  end
55
66
 
56
67
  # Returns the language's ISO 639 ID, e.g., "en_us" or "pt".
@@ -70,23 +81,22 @@ class Text::Hyphen
70
81
  # methods in an initialization block. The following initializations are
71
82
  # all equivalent:
72
83
  #
73
- # hyp = Text::Hyphenate.new(:language => 'en_us')
74
- # hyp = Text::Hyphenate.new(language: 'en_us') # under Ruby 1.9
75
- # hyp = Text::Hyphenate.new { |h| h.language = 'en_us' }
84
+ # hyp = Text::Hyphen.new(language: "en_us")
85
+ # hyp = Text::Hyphen.new { |h| h.language = "en_us" }
76
86
  def initialize(options = {}) # :yields self:
77
87
  @iso_language = options[:language]
78
- @left = options[:left]
79
- @right = options[:right]
80
- @language = nil
88
+ @left = options[:left]
89
+ @right = options[:right]
90
+ @language = nil
81
91
 
82
- @cache = {}
83
- @vcache = {}
92
+ @cache = {}
93
+ @vcache = {}
84
94
 
85
- @hyphen = {}
95
+ @hyphen = {}
86
96
  @begin_hyphen = {}
87
- @end_hyphen = {}
88
- @both_hyphen = {}
89
- @exception = {}
97
+ @end_hyphen = {}
98
+ @both_hyphen = {}
99
+ @exception = {}
90
100
 
91
101
  @first_load = true
92
102
  yield self if block_given?
@@ -94,57 +104,87 @@ class Text::Hyphen
94
104
 
95
105
  load_language
96
106
 
97
- @left ||= DEFAULT_MIN_LEFT
107
+ @left ||= DEFAULT_MIN_LEFT
98
108
  @right ||= DEFAULT_MIN_RIGHT
99
109
  end
100
110
 
101
111
  # Returns an array of character positions where a word can be hyphenated.
102
112
  #
103
- # hyp.hyphenate('representation') #=> [3, 5, 8 10]
113
+ # hyp.hyphenate("representation") #=> [3, 5, 8, 10]
104
114
  #
105
115
  # Because hyphenation can be expensive, if the word has been hyphenated
106
116
  # previously, it will be returned from a per-instance cache.
117
+ #
118
+ # #hyphenate supports phrase hyphenation:
119
+ #
120
+ # hyp.hyphenate("This useful library supports phrases and sentences.")
121
+ # #=> [8, 14, 23, 27, 34, 44]
122
+ #
123
+ # When phrases are hyphenated, each word is processed individually and the
124
+ # result is returned as a single continuous list of hyphenation points.
107
125
  def hyphenate(word)
108
- word = word.downcase
109
- $stderr.puts "Hyphenating #{word}" if DEBUG
110
- return @cache[word] if @cache.has_key?(word)
111
- res = @language.exceptions[word]
112
- return @cache[word] = make_result_list(res) if res
113
-
114
- letters = word.scan(@language.scan_re)
115
- $stderr.puts letters.inspect if DEBUG
116
- word_size = letters.size
117
-
118
- result = [0] * (word_size + 1)
119
- right_stop = word_size - @right
120
-
121
- updater = Proc.new do |hash, str, pos|
122
- if hash.has_key?(str)
123
- $stderr.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
124
- hash[str].scan(@language.scan_re).each_with_index do |cc, ii|
125
- cc = cc.to_i
126
- result[ii + pos] = cc if cc > result[ii + pos]
126
+ words = if phrase?(word)
127
+ word.downcase.split(/[[:space:]]/)
128
+ else
129
+ [word.downcase]
130
+ end
131
+
132
+ points = words.map do |word|
133
+ next @cache[word] if @cache.has_key?(word)
134
+
135
+ if (exception = @language.exceptions[word])
136
+ next @cache[word] = make_result_list(exception)
137
+ end
138
+
139
+ letters = word.scan(@language.scan_re)
140
+ word_size = letters.size
141
+
142
+ result = [0] * (word_size + 1)
143
+ right_stop = word_size - @right
144
+
145
+ updater = proc do |hash, str, pos|
146
+ if hash.has_key?(str)
147
+ hash[str].scan(@language.scan_re).each_with_index do |cc, ii|
148
+ cc = cc.to_i
149
+ result[ii + pos] = cc if cc > result[ii + pos]
150
+ end
127
151
  end
128
- $stderr.print ": #{result.inspect}\n" if DEBUG
129
152
  end
130
- end
131
153
 
132
154
  # Walk the word
133
- (0..right_stop).each do |pos|
134
- rest_length = word_size - pos
135
- (1..rest_length).each do |length|
136
- substr = letters[pos, length].join('')
137
- updater[@language.hyphen, substr, pos]
138
- updater[@language.start, substr, pos] if pos.zero?
139
- updater[@language.stop, substr, pos] if (length == rest_length)
155
+ (0..right_stop).each do |pos|
156
+ rest_length = word_size - pos
157
+ (1..rest_length).each do |length|
158
+ substr = letters[pos, length].join("")
159
+ updater[@language.hyphen, substr, pos]
160
+ updater[@language.start, substr, pos] if pos.zero?
161
+ updater[@language.stop, substr, pos] if length == rest_length
162
+ end
140
163
  end
164
+
165
+ updater[@language.both, word, 0] if @language.both[word]
166
+
167
+ (0..@left).each { |i| result[i] = 0 }
168
+ ((-1 - @right)..-1).each { |i| result[i] = 0 }
169
+ @cache[word] = make_result_list(result)
141
170
  end
142
171
 
143
- updater[@language.both, word, 0] if @language.both[word]
172
+ if points.length > 1
173
+ offset = 0
174
+ result = []
144
175
 
145
- (0..@left).each { |i| result[i] = 0 }
146
- ((-1 - @right)..(-1)).each { |i| result[i] = 0 }
147
- @cache[word] = make_result_list(result)
176
+ points.each_with_index do |word, i|
177
+ word.each do |pos|
178
+ result << pos + offset
179
+ end
180
+
181
+ offset += words[i].length + 1
182
+ end
183
+
184
+ result
185
+ else
186
+ points.flatten
187
+ end
148
188
  end
149
189
 
150
190
  # Returns a visualization of the hyphenation points.
@@ -157,8 +197,15 @@ class Text::Hyphen
157
197
  #
158
198
  # Because hyphenation can be expensive, if the word has been visualised
159
199
  # previously, it will be returned from a per-instance cache.
160
- def visualise(word, hyphen = '-')
200
+ #
201
+ # #visualise supports phrase hyphenation:
202
+ #
203
+ # hyp.visualise("This useful library supports phrases and sentences.")
204
+ # #=> This use-ful li-brary sup-port-s phras-es and sen-tences.
205
+ def visualise(word, hyphen = "-")
206
+ return visualise_phrase(word, hyphen) if phrase?(word)
161
207
  return @vcache[word] if @vcache.has_key?(word)
208
+
162
209
  w = word.dup
163
210
  s = hyphen.size
164
211
  hyphenate(w).each_with_index do |pos, n|
@@ -168,7 +215,7 @@ class Text::Hyphen
168
215
  end
169
216
  @vcache[word] = w
170
217
  end
171
- alias visualize visualise
218
+ alias_method :visualize, :visualise
172
219
 
173
220
  # Clears the per-instance hyphenation and visualization caches.
174
221
  def clear_cache!
@@ -177,29 +224,33 @@ class Text::Hyphen
177
224
  end
178
225
 
179
226
  # This function will hyphenate a word so that the first point is at most
227
+ # +size+ characters.
180
228
  #
181
229
  # NOTE: if hyphen is set to a string, it will still be counted as one
182
230
  # character (since it represents a hyphen)
183
231
  #
184
- # +size+ characters.
185
- def hyphenate_to(word, size, hyphen = '-')
232
+ # #hyphenate_to does not support phrase hyphenation and will throw an
233
+ # exception if there are spaces.
234
+ def hyphenate_to(word, size, hyphen = "-")
235
+ raise ArgumentError, "#hyphenate_to does not support phrases" if phrase?(word)
236
+
186
237
  point = hyphenate(word).delete_if { |e| e >= size }.max
187
238
  if point.nil?
188
239
  [nil, word]
189
240
  else
190
- [word[0 ... point] + hyphen, word[point .. -1]]
241
+ [word[0...point] + hyphen, word[point..-1]]
191
242
  end
192
243
  end
193
244
 
194
245
  # Returns a string describing the structure of the patterns for the
195
246
  # language of this hyphenation object.
196
247
  def stats
197
- _b = @language.both.size
198
- _s = @language.start.size
199
- _e = @language.stop.size
200
- _h = @language.hyphen.size
201
- _x = @language.exceptions.size
202
- _T = _b + _s + _e + _h + _x
248
+ stats_both = @language.both.size
249
+ stats_start = @language.start.size
250
+ stats_end = @language.stop.size
251
+ stats_hyphens = @language.hyphen.size
252
+ stats_exceptions = @language.exceptions.size
253
+ stats_total = stats_both + stats_start + stats_end + stats_hyphens + stats_exceptions
203
254
 
204
255
  s = <<-EOS
205
256
 
@@ -210,25 +261,13 @@ The language '%s' contains %d total hyphenation patterns.
210
261
  % 6d patterns are normal patterns.
211
262
  % 6d patterns are exceptions.
212
263
 
213
- EOS
214
- s % [ @iso_language, _T, _s, _e, _b, _h, _x ]
264
+ EOS
265
+ s % [@iso_language, stats_total, stats_start, stats_end, stats_both, stats_hyphens, stats_exceptions]
215
266
  end
216
267
 
217
- def updateresult(hash, str, pos)
218
- if hash.has_key?(str)
219
- STDERR.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
220
- hash[str].scan(@language.scan_re).each_with_index do |c, i|
221
- c = c.to_i
222
- @result[i + pos] = c if c > @result[i + pos]
223
- end
224
- STDERR.puts ": #{@result}" if DEBUG
225
- end
226
- end
227
- private :updateresult
228
-
229
268
  def make_result_list(res)
230
269
  r = []
231
- res.each_with_index { |c, i| r << i * (c.to_i % 2) }
270
+ res.each_with_index { |c, i| r << i * (c.to_i % 2) }
232
271
  r.reject { |i| i.to_i == 0 }
233
272
  end
234
273
  private :make_result_list
@@ -251,17 +290,20 @@ EOS
251
290
  end
252
291
  private :load_language
253
292
 
254
- # Resolves a file for cleaner loading from a hyphenation loader file.
255
- def self.require_real_hyphenation_file(loader) # :nodoc:
256
- p = File.dirname(loader)
257
- f = File.basename(loader)
258
- v = if RUBY_VERSION < "1.9.1"
259
- "1.8"
260
- else
261
- "1.9"
262
- end
263
- require File.join(p, v, f)
293
+ def split_phrase(phrase)
294
+ phrase.split(/[[:space:]]+/)
295
+ end
296
+ private :split_phrase
297
+
298
+ def visualise_phrase(phrase, hyphen)
299
+ split_phrase(phrase).map { |word| visualise(word, hyphen) }.join(" ")
300
+ end
301
+ private :visualise_phrase
302
+
303
+ def phrase?(input)
304
+ /[^[:space:]][[:space:]][^[:space:]]/.match?(input)
264
305
  end
306
+ private :phrase?
265
307
  end
266
308
 
267
309
  # vim: syntax=ruby
data/lib/text-hyphen.rb CHANGED
@@ -1,2 +1,2 @@
1
1
  # -*- ruby encoding: utf-8 -*-
2
- require 'text/hyphen'
2
+ require "text/hyphen"
@@ -1,10 +1,10 @@
1
- # -*- encoding: latin1 -*-
1
+ # -*- encoding: iso-8859-1 -*-
2
2
 
3
3
  module TestTextHyphenData
4
4
  def self.bug_9807_data
5
5
  txt = "Dampfschifffahrtskapitänsmützenhalterhersteller"
6
6
  pts = [5, 11, 17, 19, 21, 25, 28, 31, 34, 37, 40, 44]
7
7
  viz = "Dampf-schiff-fahrts-ka-pi-täns-müt-zen-hal-ter-her-stel-ler"
8
- [ txt, pts, viz ]
8
+ [txt, pts, viz]
9
9
  end
10
10
  end
@@ -5,6 +5,6 @@ module TestTextHyphenData
5
5
  txt = "Dampfschifffahrtskapitänsmützenhalterhersteller"
6
6
  pts = [5, 11, 17, 19, 21, 25, 28, 31, 34, 37, 40, 44]
7
7
  viz = "Dampf-schiff-fahrts-ka-pi-täns-müt-zen-hal-ter-her-stel-ler"
8
- [ txt, pts, viz ]
8
+ [txt, pts, viz]
9
9
  end
10
10
  end
data/test/test_bugs.rb CHANGED
@@ -1,16 +1,17 @@
1
1
  # -*- encoding: utf-8 -*-
2
- require 'test/unit'
3
- require 'text-hyphen'
2
+
3
+ require "test/unit"
4
+ require "text-hyphen"
4
5
 
5
6
  # The behaviour of Text::Hyphen differs based on the version and the
6
- # encoding. Ruby 1.8 fails if the input is not latin1 and the hyphenation
7
- # patterns are latin1. Ruby 1.9 always expects UTF-8 patterns.
8
- data_version = if RUBY_VERSION < '1.9.1'
9
- 'latin1'
10
- else
11
- 'utf-8'
12
- end
13
- data_path = File.join(File.dirname(__FILE__), 'data')
7
+ # encoding. Ruby 1.8 fails if the input is not iso-8859-1 and the hyphenation
8
+ # patterns are iso-8859-1. Ruby 1.9 always expects UTF-8 patterns.
9
+ data_version = if RUBY_VERSION < "1.9.1"
10
+ "iso-8859-1"
11
+ else
12
+ "utf-8"
13
+ end
14
+ data_path = File.join(File.dirname(__FILE__), "data")
14
15
  load File.join(data_path, "bug_9807_#{data_version}.rb")
15
16
 
16
17
  class TestTextHyphenBugs < Test::Unit::TestCase
@@ -19,17 +20,17 @@ class TestTextHyphenBugs < Test::Unit::TestCase
19
20
  # http://rubyforge.org/tracker/index.php?func=detail&aid=28498&group_id=294&atid=1195
20
21
  txt, pts, viz = TestTextHyphenData.bug_9807_data
21
22
 
22
- de1 = Text::Hyphen.new(:language => 'de')
23
+ de1 = Text::Hyphen.new(:language => "de")
23
24
  assert_equal pts, de1.hyphenate(txt)
24
25
  assert_equal viz, de1.visualize(txt)
25
26
 
26
- de2 = Text::Hyphen.new(:language => 'de2')
27
+ de2 = Text::Hyphen.new(:language => "de2")
27
28
  assert_equal pts, de2.hyphenate(txt)
28
29
  assert_equal viz, de2.visualize(txt)
29
30
  end
30
31
 
31
32
  def test_rubyforge_28128
32
- en_us = Text::Hyphen.new(:language => 'en_us')
33
+ en_us = Text::Hyphen.new(:language => "en_us")
33
34
  assert_equal [], en_us.hyphenate("to")
34
35
  assert_equal "to", en_us.visualize("to")
35
36
  end