text-hyphen 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.standard.yml +5 -0
  3. data/Code-of-Conduct.md +73 -0
  4. data/Contributing.md +68 -0
  5. data/History.md +139 -0
  6. data/Licence.md +159 -0
  7. data/Manifest.txt +12 -5
  8. data/README.md +81 -0
  9. data/Rakefile +68 -19
  10. data/bin/ruby-hyphen +0 -0
  11. data/lib/text/hyphen/language/1.8/de1.rb +1307 -571
  12. data/lib/text/hyphen/language/1.8/en_us.rb +412 -453
  13. data/lib/text/hyphen/language/1.8/fr.rb +128 -334
  14. data/lib/text/hyphen/language/1.8/la.rb +1 -0
  15. data/lib/text/hyphen/language/1.8/lt.rb +115 -0
  16. data/lib/text/hyphen/language/1.8/pt.rb +2 -1
  17. data/lib/text/hyphen/language/1.8/sk.rb +277 -0
  18. data/lib/text/hyphen/language/1.9/ca.rb +2 -1
  19. data/lib/text/hyphen/language/1.9/cs.rb +2 -1
  20. data/lib/text/hyphen/language/1.9/da.rb +2 -1
  21. data/lib/text/hyphen/language/1.9/de1.rb +1382 -646
  22. data/lib/text/hyphen/language/1.9/de2.rb +110 -109
  23. data/lib/text/hyphen/language/1.9/en_uk.rb +2 -1
  24. data/lib/text/hyphen/language/1.9/en_us.rb +412 -454
  25. data/lib/text/hyphen/language/1.9/es.rb +2 -1
  26. data/lib/text/hyphen/language/1.9/et.rb +6 -5
  27. data/lib/text/hyphen/language/1.9/eu.rb +4 -3
  28. data/lib/text/hyphen/language/1.9/fi.rb +3 -2
  29. data/lib/text/hyphen/language/1.9/fr.rb +136 -343
  30. data/lib/text/hyphen/language/1.9/ga.rb +27 -26
  31. data/lib/text/hyphen/language/1.9/hr.rb +6 -5
  32. data/lib/text/hyphen/language/1.9/hsb.rb +3 -2
  33. data/lib/text/hyphen/language/1.9/hu1.rb +3 -2
  34. data/lib/text/hyphen/language/1.9/hu2.rb +5 -4
  35. data/lib/text/hyphen/language/1.9/ia.rb +2 -1
  36. data/lib/text/hyphen/language/1.9/id.rb +8 -7
  37. data/lib/text/hyphen/language/1.9/is.rb +2 -1
  38. data/lib/text/hyphen/language/1.9/it.rb +74 -74
  39. data/lib/text/hyphen/language/1.9/la.rb +54 -53
  40. data/lib/text/hyphen/language/1.9/lt.rb +116 -0
  41. data/lib/text/hyphen/language/1.9/mn.rb +7 -6
  42. data/lib/text/hyphen/language/1.9/nl.rb +2 -1
  43. data/lib/text/hyphen/language/1.9/no1.rb +3 -2
  44. data/lib/text/hyphen/language/1.9/no2.rb +3 -2
  45. data/lib/text/hyphen/language/1.9/pl.rb +2 -1
  46. data/lib/text/hyphen/language/1.9/pt.rb +3 -2
  47. data/lib/text/hyphen/language/1.9/ru.rb +2 -1
  48. data/lib/text/hyphen/language/1.9/sk.rb +280 -0
  49. data/lib/text/hyphen/language/1.9/sv.rb +4 -3
  50. data/lib/text/hyphen/language/cs.rb +1 -1
  51. data/lib/text/hyphen/language/de.rb +2 -1
  52. data/lib/text/hyphen/language/de1.rb +1 -1
  53. data/lib/text/hyphen/language/de2.rb +1 -1
  54. data/lib/text/hyphen/language/en_us.rb +1 -1
  55. data/lib/text/hyphen/language/eu.rb +1 -1
  56. data/lib/text/hyphen/language/fr.rb +1 -1
  57. data/lib/text/hyphen/language/hu.rb +1 -1
  58. data/lib/text/hyphen/language/hu1.rb +1 -1
  59. data/lib/text/hyphen/language/hu2.rb +1 -1
  60. data/lib/text/hyphen/language/is.rb +1 -1
  61. data/lib/text/hyphen/language/lt.rb +4 -0
  62. data/lib/text/hyphen/language/ms.rb +3 -3
  63. data/lib/text/hyphen/language/nl.rb +1 -1
  64. data/lib/text/hyphen/language/no.rb +1 -1
  65. data/lib/text/hyphen/language/sk.rb +4 -0
  66. data/lib/text/hyphen/language.rb +45 -45
  67. data/lib/text/hyphen.rb +139 -97
  68. data/lib/text-hyphen.rb +1 -1
  69. data/test/data/bug_9807_latin1.rb +2 -2
  70. data/test/data/bug_9807_utf-8.rb +1 -1
  71. data/test/test_bugs.rb +14 -13
  72. data/test/test_text_hyphen.rb +31 -21
  73. metadata +146 -96
  74. data/.autotest +0 -23
  75. data/.gemtest +0 -0
  76. data/History.rdoc +0 -99
  77. data/License.rdoc +0 -159
  78. data/README.rdoc +0 -95
  79. data/text-hyphen.gemspec +0 -51
@@ -1,4 +1,4 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
- Text::Hyphen::Language.aliases_for "HU2" => %W(HUN HU)
4
+ Text::Hyphen::Language.aliases_for "HU2" => %W[HUN HU]
@@ -1,4 +1,4 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
- Text::Hyphen::Language.aliases_for "IS" => %W(ICE ISL)
4
+ Text::Hyphen::Language.aliases_for "IS" => %W[ICE ISL]
@@ -0,0 +1,4 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
+ Text::Hyphen::Language.aliases_for "LT" => "LTU"
@@ -1,9 +1,9 @@
1
- require 'text/hyphen/language/id'
1
+ require "text/hyphen/language/id"
2
2
 
3
3
  unless defined? Text::Hyphen::Language::MS
4
4
  Text::Hyphen::Language::MS = Text::Hyphen::Language.new(Text::Hyphen::Language::ID) do |malay|
5
- malay.isocode = 'ms'
5
+ malay.isocode = "ms"
6
6
  end
7
7
 
8
- Text::Hyphen::Language.aliases_for "MS" => %W(MAY MSA)
8
+ Text::Hyphen::Language.aliases_for "MS" => %W[MAY MSA]
9
9
  end
@@ -1,4 +1,4 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
- Text::Hyphen::Language.aliases_for "NL" => %W(DUT NLD)
4
+ Text::Hyphen::Language.aliases_for "NL" => %W[DUT NLD]
@@ -1 +1 @@
1
- require 'text/hyphen/language/no1'
1
+ require "text/hyphen/language/no1"
@@ -0,0 +1,4 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Text::Hyphen.require_real_hyphenation_file(__FILE__)
4
+ Text::Hyphen::Language.aliases_for "SK" => "SVK"
@@ -4,32 +4,32 @@
4
4
  # patterns are defined as instances of this class—and only this class. This
5
5
  # is a deliberate "breaking" of Ruby's concept of duck-typing and is
6
6
  # intended to provide an indication that the patterns have been converted
7
- # from TeX encodings to other encodings (e.g., latin1 or UTF-8) that are
7
+ # from TeX encodings to other encodings (e.g., iso-8859-1 or UTF-8) that are
8
8
  # more suitable to general text manipulations.
9
9
  class Text::Hyphen::Language
10
- WORD_START_RE = %r{^\.} #:nodoc:
11
- WORD_END_RE = %r{\.$} #:nodoc:
12
- DIGIT_RE = %r{\d} #:nodoc:
13
- NONDIGIT_RE = %r{\D} #:nodoc:
14
- DASH_RE = %r{-} #:nodoc:
15
- EXCEPTION_DASH0_RE = %r{[^-](?=[^-])} #:nodoc:
16
- EXCEPTION_DASH1_RE = %r{[^-]-} #:nodoc:
17
- EXCEPTION_NONUM_RE = %r{[^01]} #:nodoc:
18
- ZERO_INSERT_RE = %r{(\D)(?=\D)} #:nodoc:
19
- ZERO_START_RE = %r{^(?=\D)} #:nodoc:
20
-
21
- DEFAULT_ENCODING = if RUBY_VERSION < "1.9.1" #:nodoc:
22
- "latin1"
23
- else
24
- "utf-8"
25
- end
10
+ WORD_START_RE = %r{^\.} # :nodoc:
11
+ WORD_END_RE = %r{\.$} # :nodoc:
12
+ DIGIT_RE = %r{\d} # :nodoc:
13
+ NONDIGIT_RE = %r{\D} # :nodoc:
14
+ DASH_RE = %r{-} # :nodoc:
15
+ EXCEPTION_DASH0_RE = %r{[^-](?=[^-])} # :nodoc:
16
+ EXCEPTION_DASH1_RE = %r{[^-]-} # :nodoc:
17
+ EXCEPTION_NONUM_RE = %r{[^01]} # :nodoc:
18
+ ZERO_INSERT_RE = %r{(\D)(?=\D)} # :nodoc:
19
+ ZERO_START_RE = %r{^(?=\D)} # :nodoc:
20
+
21
+ DEFAULT_ENCODING = if RUBY_VERSION < "1.9.1" # :nodoc:
22
+ "iso-8859-1"
23
+ else
24
+ "utf-8"
25
+ end
26
26
 
27
27
  # The character scan regular expression to use.
28
- def scan_re #:nodoc:
29
- if RUBY_VERSION < '1.9.1'
28
+ def scan_re # :nodoc:
29
+ if RUBY_VERSION < "1.9.1"
30
30
  return %r{.}u if @encoding =~ /utf-?8/i
31
31
  end
32
- return %r{.}
32
+ %r{.}
33
33
  end
34
34
 
35
35
  # The encoding of the hyphenation definitions. The text to be compared
@@ -66,21 +66,21 @@ class Text::Hyphen::Language
66
66
  @pattern_text = pats.dup
67
67
 
68
68
  @patterns = {
69
- :both => {},
70
- :start => {},
71
- :stop => {},
69
+ :both => {},
70
+ :start => {},
71
+ :stop => {},
72
72
  :hyphen => {}
73
73
  }
74
74
 
75
- plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$}, '') }
75
+ plist = @pattern_text.split($/).map { |ln| ln.gsub(%r{%.*$}, "") }
76
76
  plist.each do |line|
77
77
  line.split.each do |word|
78
78
  next if word.empty?
79
79
 
80
80
  start = stop = false
81
81
 
82
- start = true if word.sub!(WORD_START_RE, '')
83
- stop = true if word.sub!(WORD_END_RE, '')
82
+ start = true if word.sub!(WORD_START_RE, "")
83
+ stop = true if word.sub!(WORD_END_RE, "")
84
84
 
85
85
  # Insert zeroes and start with some digit
86
86
  word.gsub!(ZERO_INSERT_RE) { "#{$1}0" }
@@ -88,17 +88,17 @@ class Text::Hyphen::Language
88
88
 
89
89
  # This assumes that the pattern lists are already in lowercase
90
90
  # form only.
91
- tag = word.gsub(DIGIT_RE, '')
92
- value = word.gsub(NONDIGIT_RE, '')
91
+ tag = word.gsub(DIGIT_RE, "")
92
+ value = word.gsub(NONDIGIT_RE, "")
93
93
 
94
- if start and stop
95
- set = :both
94
+ set = if start && stop
95
+ :both
96
96
  elsif start
97
- set = :start
97
+ :start
98
98
  elsif stop
99
- set = :stop
99
+ :stop
100
100
  else
101
- set = :hyphen
101
+ :hyphen
102
102
  end
103
103
 
104
104
  @patterns[set][tag] = value
@@ -116,10 +116,10 @@ class Text::Hyphen::Language
116
116
  @exceptions = {}
117
117
 
118
118
  @exception_text.split.each do |word|
119
- tag = word.gsub(DASH_RE,'')
120
- value = "0" + word.gsub(EXCEPTION_DASH0_RE, '0').gsub(EXCEPTION_DASH1_RE, '1')
121
- value.gsub!(EXCEPTION_NONUM_RE, '0')
122
- @exceptions[tag] = value.scan(self.scan_re).map { |c| c.to_i }
119
+ tag = word.gsub(DASH_RE, "")
120
+ value = "0" + word.gsub(EXCEPTION_DASH0_RE, "0").gsub(EXCEPTION_DASH1_RE, "1")
121
+ value.gsub!(EXCEPTION_NONUM_RE, "0")
122
+ @exceptions[tag] = value.scan(scan_re).map { |c| c.to_i }
123
123
  end
124
124
 
125
125
  true
@@ -142,16 +142,16 @@ class Text::Hyphen::Language
142
142
  # instance of Text::Hyphen::Language.
143
143
  def initialize(language = nil)
144
144
  if language.nil?
145
- self.encoding DEFAULT_ENCODING
146
- self.patterns ""
147
- self.exceptions ""
145
+ encoding DEFAULT_ENCODING
146
+ patterns ""
147
+ exceptions ""
148
148
  self.left = 2
149
149
  self.right = 2
150
150
  self.isocode = nil
151
- elsif language.kind_of? Text::Hyphen::Language
152
- self.encoding language.encoding
153
- self.patterns language.instance_variable_get(:@pattern_text)
154
- self.exceptions language.instance_variable_get(:@exception_text)
151
+ elsif language.is_a? Text::Hyphen::Language
152
+ encoding language.encoding
153
+ patterns language.instance_variable_get(:@pattern_text)
154
+ exceptions language.instance_variable_get(:@exception_text)
155
155
  self.left = language.left
156
156
  self.right = language.right
157
157
  self.isocode = language.isocode
@@ -171,7 +171,7 @@ class Text::Hyphen::Language
171
171
  end
172
172
  language = const_get(language)
173
173
 
174
- [ alias_names ].flatten.each do |alias_name|
174
+ [alias_names].flatten.each do |alias_name|
175
175
  next if const_defined? alias_name
176
176
  const_set(alias_name, language)
177
177
  end
data/lib/text/hyphen.rb CHANGED
@@ -7,10 +7,21 @@ end
7
7
  # hyphenation algorithm with pattern files. Each object is constructed with
8
8
  # a specific language's hyphenation patterns.
9
9
  class Text::Hyphen
10
- DEBUG = false
11
- VERSION = '1.4.1'
10
+ # Resolves a file for cleaner loading from a hyphenation loader file.
11
+ def self.require_real_hyphenation_file(loader) # :nodoc:
12
+ p = File.dirname(loader)
13
+ f = File.basename(loader)
14
+ v = if RUBY_VERSION < "1.9.1"
15
+ "1.8"
16
+ else
17
+ "1.9"
18
+ end
19
+ require File.join(p, v, f)
20
+ end
12
21
 
13
- DEFAULT_MIN_LEFT = 2
22
+ VERSION = "1.5.0"
23
+
24
+ DEFAULT_MIN_LEFT = 2
14
25
  DEFAULT_MIN_RIGHT = 2
15
26
 
16
27
  # No fewer than this number of letters will show up to the left of the
@@ -26,31 +37,31 @@ class Text::Hyphen
26
37
  # two or three character ISO 639 code, with the two character form being
27
38
  # the canonical resource name. This will load the language hyphenation
28
39
  # definitions from text/hyphen/language/<code> as a Ruby class. The
29
- # resource 'text/hyphen/language/en_us' defines the language class
40
+ # resource "text/hyphen/language/en_us" defines the language class
30
41
  # Text::Hyphen::Language::EN_US. It also defines the secondary forms
31
42
  # Text::Hyphen::Language::EN and Text::Hyphen::Language::ENG_US.
32
43
  #
33
44
  # Minimal transformations will be performed on the language code provided,
34
- # such that any dashes are converted to underscores (e.g., 'en-us' becomes
35
- # 'en_us') and all characters are regularised. Resource names will be
36
- # downcased and class names will be converted to uppercase (e.g., 'Pt' for
37
- # the Portuguese language becomes 'pt' and 'PT', respectively).
45
+ # such that any dashes are converted to underscores (e.g., "en-us" becomes
46
+ # "en_us") and all characters are regularised. Resource names will be
47
+ # downcased and class names will be converted to uppercase (e.g., "Pt" for
48
+ # the Portuguese language becomes "pt" and "PT", respectively).
38
49
  #
39
50
  # The language may also be specified as an instance of
40
51
  # Text::Hyphen::Language.
41
- attr_accessor :language
52
+ #
53
+ # :attr_accessor: language
54
+ attr_reader :language
42
55
 
43
- undef :language=
44
- def language=(lang) #:nodoc:
45
- require 'text/hyphen/language' unless defined?(Text::Hyphen::Language)
46
- if lang.kind_of? Text::Hyphen::Language
56
+ def language=(lang) # :nodoc:
57
+ require "text/hyphen/language" unless defined?(Text::Hyphen::Language)
58
+ if lang.is_a? Text::Hyphen::Language
47
59
  @iso_language = lang.to_s.split(%r{::}o)[-1].downcase
48
- @language = lang
60
+ @language = lang
49
61
  else
50
62
  @iso_language = lang.downcase
51
63
  load_language
52
64
  end
53
- @iso_language
54
65
  end
55
66
 
56
67
  # Returns the language's ISO 639 ID, e.g., "en_us" or "pt".
@@ -70,23 +81,22 @@ class Text::Hyphen
70
81
  # methods in an initialization block. The following initializations are
71
82
  # all equivalent:
72
83
  #
73
- # hyp = Text::Hyphenate.new(:language => 'en_us')
74
- # hyp = Text::Hyphenate.new(language: 'en_us') # under Ruby 1.9
75
- # hyp = Text::Hyphenate.new { |h| h.language = 'en_us' }
84
+ # hyp = Text::Hyphen.new(language: "en_us")
85
+ # hyp = Text::Hyphen.new { |h| h.language = "en_us" }
76
86
  def initialize(options = {}) # :yields self:
77
87
  @iso_language = options[:language]
78
- @left = options[:left]
79
- @right = options[:right]
80
- @language = nil
88
+ @left = options[:left]
89
+ @right = options[:right]
90
+ @language = nil
81
91
 
82
- @cache = {}
83
- @vcache = {}
92
+ @cache = {}
93
+ @vcache = {}
84
94
 
85
- @hyphen = {}
95
+ @hyphen = {}
86
96
  @begin_hyphen = {}
87
- @end_hyphen = {}
88
- @both_hyphen = {}
89
- @exception = {}
97
+ @end_hyphen = {}
98
+ @both_hyphen = {}
99
+ @exception = {}
90
100
 
91
101
  @first_load = true
92
102
  yield self if block_given?
@@ -94,57 +104,87 @@ class Text::Hyphen
94
104
 
95
105
  load_language
96
106
 
97
- @left ||= DEFAULT_MIN_LEFT
107
+ @left ||= DEFAULT_MIN_LEFT
98
108
  @right ||= DEFAULT_MIN_RIGHT
99
109
  end
100
110
 
101
111
  # Returns an array of character positions where a word can be hyphenated.
102
112
  #
103
- # hyp.hyphenate('representation') #=> [3, 5, 8 10]
113
+ # hyp.hyphenate("representation") #=> [3, 5, 8, 10]
104
114
  #
105
115
  # Because hyphenation can be expensive, if the word has been hyphenated
106
116
  # previously, it will be returned from a per-instance cache.
117
+ #
118
+ # #hyphenate supports phrase hyphenation:
119
+ #
120
+ # hyp.hyphenate("This useful library supports phrases and sentences.")
121
+ # #=> [8, 14, 23, 27, 34, 44]
122
+ #
123
+ # When phrases are hyphenated, each word is processed individually and the
124
+ # result is returned as a single continuous list of hyphenation points.
107
125
  def hyphenate(word)
108
- word = word.downcase
109
- $stderr.puts "Hyphenating #{word}" if DEBUG
110
- return @cache[word] if @cache.has_key?(word)
111
- res = @language.exceptions[word]
112
- return @cache[word] = make_result_list(res) if res
113
-
114
- letters = word.scan(@language.scan_re)
115
- $stderr.puts letters.inspect if DEBUG
116
- word_size = letters.size
117
-
118
- result = [0] * (word_size + 1)
119
- right_stop = word_size - @right
120
-
121
- updater = Proc.new do |hash, str, pos|
122
- if hash.has_key?(str)
123
- $stderr.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
124
- hash[str].scan(@language.scan_re).each_with_index do |cc, ii|
125
- cc = cc.to_i
126
- result[ii + pos] = cc if cc > result[ii + pos]
126
+ words = if phrase?(word)
127
+ word.downcase.split(/[[:space:]]/)
128
+ else
129
+ [word.downcase]
130
+ end
131
+
132
+ points = words.map do |word|
133
+ next @cache[word] if @cache.has_key?(word)
134
+
135
+ if (exception = @language.exceptions[word])
136
+ next @cache[word] = make_result_list(exception)
137
+ end
138
+
139
+ letters = word.scan(@language.scan_re)
140
+ word_size = letters.size
141
+
142
+ result = [0] * (word_size + 1)
143
+ right_stop = word_size - @right
144
+
145
+ updater = proc do |hash, str, pos|
146
+ if hash.has_key?(str)
147
+ hash[str].scan(@language.scan_re).each_with_index do |cc, ii|
148
+ cc = cc.to_i
149
+ result[ii + pos] = cc if cc > result[ii + pos]
150
+ end
127
151
  end
128
- $stderr.print ": #{result.inspect}\n" if DEBUG
129
152
  end
130
- end
131
153
 
132
154
  # Walk the word
133
- (0..right_stop).each do |pos|
134
- rest_length = word_size - pos
135
- (1..rest_length).each do |length|
136
- substr = letters[pos, length].join('')
137
- updater[@language.hyphen, substr, pos]
138
- updater[@language.start, substr, pos] if pos.zero?
139
- updater[@language.stop, substr, pos] if (length == rest_length)
155
+ (0..right_stop).each do |pos|
156
+ rest_length = word_size - pos
157
+ (1..rest_length).each do |length|
158
+ substr = letters[pos, length].join("")
159
+ updater[@language.hyphen, substr, pos]
160
+ updater[@language.start, substr, pos] if pos.zero?
161
+ updater[@language.stop, substr, pos] if length == rest_length
162
+ end
140
163
  end
164
+
165
+ updater[@language.both, word, 0] if @language.both[word]
166
+
167
+ (0..@left).each { |i| result[i] = 0 }
168
+ ((-1 - @right)..-1).each { |i| result[i] = 0 }
169
+ @cache[word] = make_result_list(result)
141
170
  end
142
171
 
143
- updater[@language.both, word, 0] if @language.both[word]
172
+ if points.length > 1
173
+ offset = 0
174
+ result = []
144
175
 
145
- (0..@left).each { |i| result[i] = 0 }
146
- ((-1 - @right)..(-1)).each { |i| result[i] = 0 }
147
- @cache[word] = make_result_list(result)
176
+ points.each_with_index do |word, i|
177
+ word.each do |pos|
178
+ result << pos + offset
179
+ end
180
+
181
+ offset += words[i].length + 1
182
+ end
183
+
184
+ result
185
+ else
186
+ points.flatten
187
+ end
148
188
  end
149
189
 
150
190
  # Returns a visualization of the hyphenation points.
@@ -157,8 +197,15 @@ class Text::Hyphen
157
197
  #
158
198
  # Because hyphenation can be expensive, if the word has been visualised
159
199
  # previously, it will be returned from a per-instance cache.
160
- def visualise(word, hyphen = '-')
200
+ #
201
+ # #visualise supports phrase hyphenation:
202
+ #
203
+ # hyp.visualise("This useful library supports phrases and sentences.")
204
+ # #=> This use-ful li-brary sup-port-s phras-es and sen-tences.
205
+ def visualise(word, hyphen = "-")
206
+ return visualise_phrase(word, hyphen) if phrase?(word)
161
207
  return @vcache[word] if @vcache.has_key?(word)
208
+
162
209
  w = word.dup
163
210
  s = hyphen.size
164
211
  hyphenate(w).each_with_index do |pos, n|
@@ -168,7 +215,7 @@ class Text::Hyphen
168
215
  end
169
216
  @vcache[word] = w
170
217
  end
171
- alias visualize visualise
218
+ alias_method :visualize, :visualise
172
219
 
173
220
  # Clears the per-instance hyphenation and visualization caches.
174
221
  def clear_cache!
@@ -177,29 +224,33 @@ class Text::Hyphen
177
224
  end
178
225
 
179
226
  # This function will hyphenate a word so that the first point is at most
227
+ # +size+ characters.
180
228
  #
181
229
  # NOTE: if hyphen is set to a string, it will still be counted as one
182
230
  # character (since it represents a hyphen)
183
231
  #
184
- # +size+ characters.
185
- def hyphenate_to(word, size, hyphen = '-')
232
+ # #hyphenate_to does not support phrase hyphenation and will throw an
233
+ # exception if there are spaces.
234
+ def hyphenate_to(word, size, hyphen = "-")
235
+ raise ArgumentError, "#hyphenate_to does not support phrases" if phrase?(word)
236
+
186
237
  point = hyphenate(word).delete_if { |e| e >= size }.max
187
238
  if point.nil?
188
239
  [nil, word]
189
240
  else
190
- [word[0 ... point] + hyphen, word[point .. -1]]
241
+ [word[0...point] + hyphen, word[point..-1]]
191
242
  end
192
243
  end
193
244
 
194
245
  # Returns a string describing the structure of the patterns for the
195
246
  # language of this hyphenation object.
196
247
  def stats
197
- _b = @language.both.size
198
- _s = @language.start.size
199
- _e = @language.stop.size
200
- _h = @language.hyphen.size
201
- _x = @language.exceptions.size
202
- _T = _b + _s + _e + _h + _x
248
+ stats_both = @language.both.size
249
+ stats_start = @language.start.size
250
+ stats_end = @language.stop.size
251
+ stats_hyphens = @language.hyphen.size
252
+ stats_exceptions = @language.exceptions.size
253
+ stats_total = stats_both + stats_start + stats_end + stats_hyphens + stats_exceptions
203
254
 
204
255
  s = <<-EOS
205
256
 
@@ -210,25 +261,13 @@ The language '%s' contains %d total hyphenation patterns.
210
261
  % 6d patterns are normal patterns.
211
262
  % 6d patterns are exceptions.
212
263
 
213
- EOS
214
- s % [ @iso_language, _T, _s, _e, _b, _h, _x ]
264
+ EOS
265
+ s % [@iso_language, stats_total, stats_start, stats_end, stats_both, stats_hyphens, stats_exceptions]
215
266
  end
216
267
 
217
- def updateresult(hash, str, pos)
218
- if hash.has_key?(str)
219
- STDERR.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
220
- hash[str].scan(@language.scan_re).each_with_index do |c, i|
221
- c = c.to_i
222
- @result[i + pos] = c if c > @result[i + pos]
223
- end
224
- STDERR.puts ": #{@result}" if DEBUG
225
- end
226
- end
227
- private :updateresult
228
-
229
268
  def make_result_list(res)
230
269
  r = []
231
- res.each_with_index { |c, i| r << i * (c.to_i % 2) }
270
+ res.each_with_index { |c, i| r << i * (c.to_i % 2) }
232
271
  r.reject { |i| i.to_i == 0 }
233
272
  end
234
273
  private :make_result_list
@@ -251,17 +290,20 @@ EOS
251
290
  end
252
291
  private :load_language
253
292
 
254
- # Resolves a file for cleaner loading from a hyphenation loader file.
255
- def self.require_real_hyphenation_file(loader) # :nodoc:
256
- p = File.dirname(loader)
257
- f = File.basename(loader)
258
- v = if RUBY_VERSION < "1.9.1"
259
- "1.8"
260
- else
261
- "1.9"
262
- end
263
- require File.join(p, v, f)
293
+ def split_phrase(phrase)
294
+ phrase.split(/[[:space:]]+/)
295
+ end
296
+ private :split_phrase
297
+
298
+ def visualise_phrase(phrase, hyphen)
299
+ split_phrase(phrase).map { |word| visualise(word, hyphen) }.join(" ")
300
+ end
301
+ private :visualise_phrase
302
+
303
+ def phrase?(input)
304
+ /[^[:space:]][[:space:]][^[:space:]]/.match?(input)
264
305
  end
306
+ private :phrase?
265
307
  end
266
308
 
267
309
  # vim: syntax=ruby
data/lib/text-hyphen.rb CHANGED
@@ -1,2 +1,2 @@
1
1
  # -*- ruby encoding: utf-8 -*-
2
- require 'text/hyphen'
2
+ require "text/hyphen"
@@ -1,10 +1,10 @@
1
- # -*- encoding: latin1 -*-
1
+ # -*- encoding: iso-8859-1 -*-
2
2
 
3
3
  module TestTextHyphenData
4
4
  def self.bug_9807_data
5
5
  txt = "Dampfschifffahrtskapitänsmützenhalterhersteller"
6
6
  pts = [5, 11, 17, 19, 21, 25, 28, 31, 34, 37, 40, 44]
7
7
  viz = "Dampf-schiff-fahrts-ka-pi-täns-müt-zen-hal-ter-her-stel-ler"
8
- [ txt, pts, viz ]
8
+ [txt, pts, viz]
9
9
  end
10
10
  end
@@ -5,6 +5,6 @@ module TestTextHyphenData
5
5
  txt = "Dampfschifffahrtskapitänsmützenhalterhersteller"
6
6
  pts = [5, 11, 17, 19, 21, 25, 28, 31, 34, 37, 40, 44]
7
7
  viz = "Dampf-schiff-fahrts-ka-pi-täns-müt-zen-hal-ter-her-stel-ler"
8
- [ txt, pts, viz ]
8
+ [txt, pts, viz]
9
9
  end
10
10
  end
data/test/test_bugs.rb CHANGED
@@ -1,16 +1,17 @@
1
1
  # -*- encoding: utf-8 -*-
2
- require 'test/unit'
3
- require 'text-hyphen'
2
+
3
+ require "test/unit"
4
+ require "text-hyphen"
4
5
 
5
6
  # The behaviour of Text::Hyphen differs based on the version and the
6
- # encoding. Ruby 1.8 fails if the input is not latin1 and the hyphenation
7
- # patterns are latin1. Ruby 1.9 always expects UTF-8 patterns.
8
- data_version = if RUBY_VERSION < '1.9.1'
9
- 'latin1'
10
- else
11
- 'utf-8'
12
- end
13
- data_path = File.join(File.dirname(__FILE__), 'data')
7
+ # encoding. Ruby 1.8 fails if the input is not iso-8859-1 and the hyphenation
8
+ # patterns are iso-8859-1. Ruby 1.9 always expects UTF-8 patterns.
9
+ data_version = if RUBY_VERSION < "1.9.1"
10
+ "iso-8859-1"
11
+ else
12
+ "utf-8"
13
+ end
14
+ data_path = File.join(File.dirname(__FILE__), "data")
14
15
  load File.join(data_path, "bug_9807_#{data_version}.rb")
15
16
 
16
17
  class TestTextHyphenBugs < Test::Unit::TestCase
@@ -19,17 +20,17 @@ class TestTextHyphenBugs < Test::Unit::TestCase
19
20
  # http://rubyforge.org/tracker/index.php?func=detail&aid=28498&group_id=294&atid=1195
20
21
  txt, pts, viz = TestTextHyphenData.bug_9807_data
21
22
 
22
- de1 = Text::Hyphen.new(:language => 'de')
23
+ de1 = Text::Hyphen.new(:language => "de")
23
24
  assert_equal pts, de1.hyphenate(txt)
24
25
  assert_equal viz, de1.visualize(txt)
25
26
 
26
- de2 = Text::Hyphen.new(:language => 'de2')
27
+ de2 = Text::Hyphen.new(:language => "de2")
27
28
  assert_equal pts, de2.hyphenate(txt)
28
29
  assert_equal viz, de2.visualize(txt)
29
30
  end
30
31
 
31
32
  def test_rubyforge_28128
32
- en_us = Text::Hyphen.new(:language => 'en_us')
33
+ en_us = Text::Hyphen.new(:language => "en_us")
33
34
  assert_equal [], en_us.hyphenate("to")
34
35
  assert_equal "to", en_us.visualize("to")
35
36
  end