text-hyphen 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +4 -0
- data/Changelog +4 -0
- data/INSTALL +6 -0
- data/LICENCE +47 -0
- data/README +56 -0
- data/Rakefile +116 -0
- data/bin/hyphen +107 -0
- data/lib/text/hyphen.rb +289 -0
- data/lib/text/hyphen/language.rb +112 -0
- data/lib/text/hyphen/language/ca.rb +174 -0
- data/lib/text/hyphen/language/cs.rb +363 -0
- data/lib/text/hyphen/language/da.rb +118 -0
- data/lib/text/hyphen/language/de1.rb +723 -0
- data/lib/text/hyphen/language/de2.rb +685 -0
- data/lib/text/hyphen/language/en_uk.rb +791 -0
- data/lib/text/hyphen/language/en_us.rb +493 -0
- data/lib/text/hyphen/language/es.rb +289 -0
- data/lib/text/hyphen/language/et.rb +337 -0
- data/lib/text/hyphen/language/eu.rb +115 -0
- data/lib/text/hyphen/language/fi.rb +113 -0
- data/lib/text/hyphen/language/fr.rb +392 -0
- data/lib/text/hyphen/language/ga.rb +608 -0
- data/lib/text/hyphen/language/hr.rb +124 -0
- data/lib/text/hyphen/language/hsb.rb +180 -0
- data/lib/text/hyphen/language/hu1.rb +385 -0
- data/lib/text/hyphen/language/hu2.rb +1283 -0
- data/lib/text/hyphen/language/ia.rb +73 -0
- data/lib/text/hyphen/language/id.rb +97 -0
- data/lib/text/hyphen/language/is.rb +390 -0
- data/lib/text/hyphen/language/it.rb +135 -0
- data/lib/text/hyphen/language/la.rb +134 -0
- data/lib/text/hyphen/language/mn.rb +103 -0
- data/lib/text/hyphen/language/nl.rb +1253 -0
- data/lib/text/hyphen/language/no1.rb +303 -0
- data/lib/text/hyphen/language/no2.rb +138 -0
- data/lib/text/hyphen/language/pl.rb +480 -0
- data/lib/text/hyphen/language/pt.rb +56 -0
- data/lib/text/hyphen/language/sv.rb +449 -0
- data/tests/tc_text_hyphen.rb +62 -0
- metadata +90 -0
data/ChangeLog
ADDED
data/Changelog
ADDED
data/INSTALL
ADDED
data/LICENCE
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
Text::Hyphen is copyright (c) 2004 Austin Ziegler
|
2
|
+
|
3
|
+
Licensing for Text::Hyphen is unfortunately complex because of the various
|
4
|
+
copyrights and licences of the source hyphenation files. Some of these files
|
5
|
+
are available only under the TeX licence and others are available only under
|
6
|
+
the GNU GPL while others are public domain. Each language file has these
|
7
|
+
licences embedded within the file. Please consult each file's licence to
|
8
|
+
ensure that it is compatible with your application.
|
9
|
+
|
10
|
+
The copyright on the Text::Hyphen application/library and the Ruby
|
11
|
+
translations of hyphenation files belongs to Austin Ziegler. All other
|
12
|
+
copyrights on original versions still stand; Text::Hyphen is a derivative
|
13
|
+
work of these and other projects.
|
14
|
+
|
15
|
+
Application and Compilation Licences
|
16
|
+
------------------------------------
|
17
|
+
Text::Hyphen, the application/library is licensed under the same terms as
|
18
|
+
Ruby. Note that this specifically refers to the contents of bin/hyphen,
|
19
|
+
lib/text/hyphen.rb, and lib/text/hyphen/language.rb.
|
20
|
+
|
21
|
+
Individual language hyphenation files are NOT licensed under these terms, but
|
22
|
+
under the following MIT-style licence and the original hyphenation pattern
|
23
|
+
licenses. The copyright for the original TeX hyphenation files is held by the
|
24
|
+
original authors; any mistakes in conversion of these files to Ruby is
|
25
|
+
attributable to the contributors to the Text::Hyphen package only.
|
26
|
+
|
27
|
+
The compilation package Text::Hyphen is licensed under the same terms as Ruby.
|
28
|
+
|
29
|
+
Blanket Language Hyphenation File Licence
|
30
|
+
-----------------------------------------
|
31
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
32
|
+
this software and associated documentation files (the "Software"), to deal in
|
33
|
+
the Software without restriction, including without limitation the rights to
|
34
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
35
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
36
|
+
so, subject to the following conditions:
|
37
|
+
|
38
|
+
The above copyright notice and this permission notice shall be included in all
|
39
|
+
copies or substantial portions of the Software.
|
40
|
+
|
41
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
42
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
43
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
44
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
45
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
46
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
47
|
+
SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
Text::Hyphen README
|
2
|
+
===================
|
3
|
+
|
4
|
+
Text::Hyphen will properly hyphenate various words according to the rules of
|
5
|
+
the language the word is written in. The algorithm is based on that of the TeX
|
6
|
+
typesetting system by Donald E. Knuth. This is originally based on the Perl
|
7
|
+
implementation of TeX::Hyphen[1] and the Ruby port TeX::Hyphen[2]. The
|
8
|
+
language hyphenation pattern files are based on the sources available from
|
9
|
+
CTAN[3] as of 2004.12.19 and have been translated by Austin Ziegler.
|
10
|
+
|
11
|
+
This release is 1.0, the initial release of Text::Hyphen, representing a
|
12
|
+
significant improvement over its predecessor, TeX::Hyphen.
|
13
|
+
|
14
|
+
require 'text/hyphen'
|
15
|
+
hh = Text::Hyphen.new(:language => 'en_us', :left => 2, :right => 2)
|
16
|
+
# Defaults to the above
|
17
|
+
hh = Text::Hyphen.new
|
18
|
+
|
19
|
+
word = "representation"
|
20
|
+
points = hh.hyphenate(word) #=> [3, 5, 8, 10]
|
21
|
+
puts hh.visualize(word) #=> rep-re-sen-ta-tion
|
22
|
+
|
23
|
+
Text::Hyphen is truly multilingual in nature[4]. As an example, consider the
|
24
|
+
difference between the following:
|
25
|
+
|
26
|
+
require 'text/hyphen'
|
27
|
+
# Using left and right minimum values of 0 ensures that you will see all
|
28
|
+
# possible hyphenation points, not just those that meet the minimum
|
29
|
+
# width requirements.
|
30
|
+
en = Text::Hyphen.new(:left => 0, :right => 0)
|
31
|
+
fr = Text::Hyphen.new(:language => "fr", :left => 0, :right => 0)
|
32
|
+
|
33
|
+
puts en.visualise("organiser") #=> or-gan-iser
|
34
|
+
puts fr.visualise("organiser") #=> or-ga-ni-ser
|
35
|
+
|
36
|
+
As you can see, the hyphenation is distinct between the two hyphenators.
|
37
|
+
Additional improvements over TeX::Hyphen include thread safety (except for
|
38
|
+
debug control) and support for UTF-8.
|
39
|
+
|
40
|
+
It is very important to read the LICENCE file and each language file desired,
|
41
|
+
as some languages may be held under a more strict licence than that granted by
|
42
|
+
LICENCE.
|
43
|
+
|
44
|
+
Copyright
|
45
|
+
=========
|
46
|
+
# Copyright 2004 Austin Ziegler <text-hyphen@halostatue.ca>
|
47
|
+
# See the LICENCE file for more information.
|
48
|
+
|
49
|
+
[1] <http://search.cpan.org/author/JANPAZ/TeX-Hyphen-0.140/lib/TeX/Hyphen.pm>
|
50
|
+
Maintained by Jan Pazdziora.
|
51
|
+
[2] Available at <http://rubyforge.org/projects/text-format>.
|
52
|
+
[3] <http://www.ctan.org>
|
53
|
+
[4] There are some bugs and design decisions in the original Perl
|
54
|
+
implementation of TeX::Hyphen that make it unsuitable for most
|
55
|
+
multilingual implementations that carried over to the Ruby port of
|
56
|
+
TeX::Hyphen.
|
data/Rakefile
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
#! /usr/bin/env rake
|
2
|
+
$LOAD_PATH.unshift('lib')
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'rake/gempackagetask'
|
6
|
+
require 'text/hyphen'
|
7
|
+
require 'archive/tar/minitar'
|
8
|
+
require 'zlib'
|
9
|
+
|
10
|
+
DISTDIR = "text-hyphen-#{Text::Hyphen::VERSION}"
|
11
|
+
TARDIST = "../#{DISTDIR}.tar.gz"
|
12
|
+
|
13
|
+
# Matches a release timestamp: a mandatory date (four-digit year, two-digit
# month and day, each optionally separated by '.', '/' or '-') optionally
# followed by a time of day (hour and minute, plus optional seconds, each
# optionally separated by ':' or '.').
DATE_RE = %r<(\d{4})[./-]?(\d{2})[./-]?(\d{2})(?:[\sT]?(\d{2})[:.]?(\d{2})[:.]?(\d{2})?)?>

# Converts a RELEASE_DATE string into a local Time.
#
# Time-of-day fields missing from +stamp+ default to zero. Raises
# ArgumentError when +stamp+ contains nothing DATE_RE recognises, rather than
# failing with NoMethodError on a nil match.
def parse_release_date(stamp)
  match = DATE_RE.match(stamp)
  raise ArgumentError, "Unrecognised RELEASE_DATE: #{stamp.inspect}" unless match

  # Unmatched optional groups are nil, and nil.to_i is 0.
  Time.mktime(*match.captures.map { |field| field.to_i })
end

# Timestamp taken from the RELEASE_DATE environment variable, or nil when the
# variable is not set.
ReleaseDate = ENV['RELEASE_DATE'] ? parse_release_date(ENV['RELEASE_DATE']) : nil
|
27
|
+
|
28
|
+
# Loads every tests/tc_*.rb file, then gathers each Test::Unit::TestCase
# subclass that exists afterwards into one suite named "Text::Hyphen" and runs
# it with the console test runner.
task :test do |test_task|
  require 'test/unit/testsuite'
  require 'test/unit/ui/console/testrunner'

  $LOAD_PATH.unshift('tests')

  $stderr.puts "Checking for test cases:" if test_task.verbose
  Dir['tests/tc_*.rb'].each do |path|
    $stderr.puts "\t#{path}" if test_task.verbose
    load path
  end

  all_tests = Test::Unit::TestSuite.new("Text::Hyphen")
  ObjectSpace.each_object(Class) do |klass|
    # Class#< is nil for unrelated classes and false for TestCase itself.
    next unless klass < Test::Unit::TestCase
    all_tests << klass.suite
  end

  Test::Unit::UI::Console::TestRunner.run(all_tests)
end
|
49
|
+
|
50
|
+
# The gem specification is evaluated from text-hyphen.gemspec and stamped with
# the library's own version constant.
gemspec = eval(File.read("text-hyphen.gemspec"))
gemspec.version = Text::Hyphen::VERSION

desc "Build the RubyGem for Text::Hyphen"
task :gem => [ :test ]

# Only the .gem itself is produced (no tar or zip), written to the parent
# directory.
Rake::GemPackageTask.new(gemspec) do |pkg|
  pkg.package_dir = ".."
  pkg.need_zip = false
  pkg.need_tar = false
end
|
59
|
+
|
60
|
+
desc "Build a Text::Hyphen .tar.gz distribution."
task :tar => [ TARDIST ]

# Packs the working directory into TARDIST. Entries are renamed so the archive
# unpacks into DISTDIR, CVS directories and editor backups ("~" suffix) are
# left out, anything under a bin/ path is marked executable, and ReleaseDate
# (when set) replaces each file's own modification time.
file TARDIST => [ :test ] do |t|
  current = File.basename(Dir.pwd)
  # Escaped and anchored so that a directory name containing regexp
  # metacharacters is matched literally, and only as the leading component.
  prefix = /\A#{Regexp.escape(current)}/

  Dir.chdir("..") do
    ff = gz = tw = nil
    begin
      files = Dir["#{current}/**/*"].reject { |dd| dd =~ %r{(?:/CVS/?|~$)} }
      files.map! do |dd|
        ddnew = dd.sub(prefix, DISTDIR)
        mtime = ReleaseDate || File.stat(dd).mtime
        if File.directory?(dd)
          { :name => ddnew, :mode => 0755, :dir => true, :mtime => mtime }
        else
          mode = (dd =~ %r{bin/}) ? 0755 : 0644
          # Binary read: no newline translation, and the string's size is then
          # the byte count the tar header needs, even when a file contains
          # multi-byte characters.
          data = File.open(dd, "rb") { |io| io.read }
          { :name => ddnew, :mode => mode, :data => data, :size => data.size,
            :mtime => mtime }
        end
      end

      ff = File.open(t.name.gsub(%r{^\.\./}o, ''), "wb")
      gz = Zlib::GzipWriter.new(ff)
      tw = Archive::Tar::Minitar::Writer.new(gz)

      files.each do |entry|
        if entry[:dir]
          tw.mkdir(entry[:name], entry)
        else
          tw.add_file_simple(entry[:name], entry) { |os| os.write(entry[:data]) }
        end
      end
    ensure
      tw.close if tw
      gz.close if gz
      # Closing the GzipWriter also closes ff; this only matters when the
      # writer was never constructed.
      ff.close if ff && !ff.closed?
    end
  end
end
task TARDIST => [ :test ]
|
102
|
+
|
103
|
+
# Runs GnuPG with "-ba" on +file+ and verifies that "<file>.asc" exists
# afterwards.
#
# The gpg.exe location is hard-coded to a Windows install path, and every "/"
# in the command line -- including any in +file+ -- is rewritten to "\" before
# the command is run. Raises RuntimeError if the signature file is missing.
def sign(file)
  command = %("C:/Program Files/Windows Privacy Tools/GnuPG/gpg.exe" -ba #{file})
  system command.gsub(%r{/}) { "\\" }
  # File.exists? was deprecated and later removed (Ruby 3.2); File.exist? is
  # available in every Ruby version.
  raise "Error signing with GPG." unless File.exist?("#{file}.asc")
end
|
107
|
+
|
108
|
+
# Signing targets: each depends on the task that builds its artefact and then
# signs the result.
task(:signtar => [ :tar ]) { sign TARDIST }
task(:signgem => [ :gem ]) { sign "../#{DISTDIR}.gem" }

desc "Build everything."
task :default => [ :signtar, :signgem ]
|
data/bin/hyphen
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Text::Hyphen
|
3
|
+
# Copyright 2003 - 2004, Martin DeMello and Austin Ziegler
|
4
|
+
#
|
5
|
+
# Licensed under the same terms as Ruby.
|
6
|
+
#
|
7
|
+
# $Id: hyphen,v 1.2 2004/12/20 22:43:03 austin Exp $
|
8
|
+
#++
|
9
|
+
|
10
|
+
require 'optparse'
|
11
|
+
require 'ostruct'
|
12
|
+
|
13
|
+
begin
|
14
|
+
require 'text/hyphen'
|
15
|
+
rescue LoadError
|
16
|
+
require 'rubygems'
|
17
|
+
require 'text/hyphen'
|
18
|
+
end
|
19
|
+
|
20
|
+
# Command-line front end: parse the mode and options, build a hyphenator, then
# process every remaining argument as a word.
options = OpenStruct.new
options.action = :visualise

ARGV.options do |opt|
  opt.banner = "Usage: #{File.basename($0)} [options] [mode] word+"
  opt.separator ""
  opt.separator "Modes"
  opt.on('-V', '--visualise', 'Visualises the hyphenation of the word.', 'Default action.') { |mode|
    options.action = :visualise
  }
  opt.on('-P', '--points', 'Shows the letters on which a word will', 'be hyphenated.') { |mode|
    options.action = :hyphenate
  }
  # Help text says "at most": Text::Hyphen#hyphenate_to only keeps points
  # below SIZE.
  opt.on('-H', '--hyphenate-to SIZE', Numeric, 'Hyphenates the word so that the first', 'point is at most SIZE letters.') { |size|
    options.action = :hyphenate_to
    options.size = size.to_i
  }
  opt.on("-S", "--stats", 'Shows the hyphenation statistics for the', 'current language pattern dictionary.') { |mode|
    options.action = :stats
  }

  opt.separator ""
  opt.separator "Options"
  opt.on('-L', '--left SIZE', Integer, 'Sets the minimum number of letters on', 'the left side of the word.') { |left|
    options.left = left.to_i
  }
  opt.on('-R', '--right SIZE', Integer, 'Sets the minimum number of letters on', 'the right side of the word.') { |right|
    options.right = right.to_i
  }
  opt.on('-l', '--language LANGUAGE', 'Loads the specified language resource.') { |lang|
    options.language = lang
  }

  opt.separator ""
  opt.on_tail('-h', '--help', 'Shows this help') {
    $stderr.puts opt
    exit 0
  }
  opt.on_tail('-v', '--version', 'Display the program and library version.') {
    $stderr.puts "#{File.basename($0)}: Text::Hyphen version #{Text::Hyphen::VERSION}"
    exit 0
  }
  opt.parse!
end

# Every mode except --stats needs at least one word to work on.
if ARGV.empty? && options.action != :stats
  $stderr.puts ARGV.options
  exit 0
end

hyphenator = Text::Hyphen.new do |h|
  h.left = options.left if options.left
  h.right = options.right if options.right
  h.language = options.language if options.language
end

# Prints one item followed by a space, first starting a new line when the item
# would not fit in what is left of an 80-column row.
remaining = 80
emit = lambda do |text|
  if (remaining - text.size - 1) < 0
    puts
    remaining = 80
  end
  remaining -= (text.size + 1)
  print "#{text} "
end

case options.action
when :visualise
  ARGV.each { |word| emit[hyphenator.visualise(word)] }
when :hyphenate
  ARGV.each do |word|
    hyp = hyphenator.hyphenate(word)
    print "#{word}: "
    hyp.each { |pt| print "#{word[pt, 1]} " }
    puts
  end
when :hyphenate_to
  ARGV.each do |word|
    # Text::Hyphen defines hyphenate_to (there is no visualise_to); it returns
    # [head-with-hyphen or nil, remainder], shown here as a single string.
    emit[hyphenator.hyphenate_to(word, options.size).compact.join]
  end
when :stats
  puts hyphenator.stats
end
|
data/lib/text/hyphen.rb
ADDED
@@ -0,0 +1,289 @@
|
|
1
|
+
module Text; end
|
2
|
+
|
3
|
+
# = Introduction
|
4
|
+
# Text::Hyphen -- hyphenate words using modified versions of TeX
|
5
|
+
# hyphenation patterns.
|
6
|
+
#
|
7
|
+
# == Usage
|
8
|
+
# require 'text/hyphen'
|
9
|
+
# hh = Text::Hyphen.new(:language => 'en_us', :left => 2, :right => 2)
|
10
|
+
# # Defaults to the above
|
11
|
+
# hh = Text::Hyphen.new
|
12
|
+
#
|
13
|
+
# word = "representation"
|
14
|
+
# points = hh.hyphenate(word) #=> [3, 5, 8, 10]
|
15
|
+
# puts hh.visualize(word) #=> rep-re-sen-ta-tion
|
16
|
+
#
|
17
|
+
# en = Text::Hyphen.new(:left => 0, :right => 0)
|
18
|
+
# fr = Text::Hyphen.new(:language => "fr", :left => 0, :right => 0)
|
19
|
+
# puts en.visualise("organiser") #=> or-gan-iser
|
20
|
+
# puts fr.visualise("organiser") #=> or-ga-ni-ser
|
21
|
+
#
|
22
|
+
# == Description
|
23
|
+
# Creates a new Hyphen object and loads the language patterns into
|
24
|
+
# memory. The hyphenator can then be asked for the hyphenation of
|
25
|
+
# a word. If no language is specified, then the language en_us (EN_US)
|
26
|
+
# is used by default.
|
27
|
+
#
|
28
|
+
# Copyright:: Copyright (c) 2004 Austin Ziegler
|
29
|
+
# Version:: 1.0.0
|
30
|
+
# Based On:: <tt>TeX::Hyphen</tt> 0.4 Copyright (c) 2003 - 2004
|
31
|
+
# Martin DeMello and Austin Ziegler, in turn based on
|
32
|
+
# Perl's <tt>TeX::Hyphen</tt>
|
33
|
+
# [http://search.cpan.org/author/JANPAZ/TeX-Hyphen-0.140/lib/TeX/Hyphen.pm]
|
34
|
+
# Copyright (c) 1997 - 2002 Jan Pazdziora
|
35
|
+
#
|
36
|
+
# == Licence
|
37
|
+
# Licensing for Text::Hyphen is unfortunately complex because of the
|
38
|
+
# various copyrights and licences of the source hyphenation files. Some of
|
39
|
+
# these files are available only under the TeX licence and others are
|
40
|
+
# available only under the GNU GPL while others are public domain. Each
|
41
|
+
# language file has these licences embedded within the file. Please
|
42
|
+
# consult each file's licence to ensure that it is compatible with your
|
43
|
+
# application.
|
44
|
+
#
|
45
|
+
# The copyright on the Text::Hyphen application/library and the Ruby
|
46
|
+
# translations of hyphenation files belongs to Austin Ziegler. All other
|
47
|
+
# copyrights on original versions still stand; Text::Hyphen is a derivative
|
48
|
+
# work of these and other projects.
|
49
|
+
#
|
50
|
+
# === Application and Compilation Licences
|
51
|
+
# Text::Hyphen, the application/library is licensed under the same terms
|
52
|
+
# as Ruby. Note that this specifically refers to the contents of
|
53
|
+
# bin/hyphen, lib/text/hyphen.rb, and lib/text/hyphen/language.rb.
|
54
|
+
#
|
55
|
+
# Individual language hyphenation files are NOT licensed under these
|
56
|
+
# terms, but under the following MIT-style licence and the original
|
57
|
+
# hyphenation pattern licenses. The copyright for the original TeX
|
58
|
+
# hyphenation files is held by the original authors; any mistakes in
|
59
|
+
# conversion of these files to Ruby is attributable to the contributors to
|
60
|
+
# the Text::Hyphen package only.
|
61
|
+
#
|
62
|
+
# The compilation package Text::Hyphen is licensed under the same terms as
|
63
|
+
# Ruby.
|
64
|
+
#
|
65
|
+
# === Blanket Language Hyphenation File Licence
|
66
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
67
|
+
# a copy of this software and associated documentation files (the
|
68
|
+
# "Software"), to deal in the Software without restriction, including
|
69
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
70
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
71
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
72
|
+
# the following conditions:
|
73
|
+
#
|
74
|
+
# The above copyright notice and this permission notice shall be included
|
75
|
+
# in all copies or substantial portions of the Software.
|
76
|
+
#
|
77
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
78
|
+
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
79
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
80
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
81
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
82
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
83
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
84
|
+
class Text::Hyphen
|
85
|
+
DEBUG = false
|
86
|
+
VERSION = '1.0.0'
|
87
|
+
|
88
|
+
DEFAULT_MIN_LEFT = 2
|
89
|
+
DEFAULT_MIN_RIGHT = 2
|
90
|
+
|
91
|
+
# No fewer than this number of letters will show up to the left of the
|
92
|
+
# hyphen. This overrides the default specified in the language.
|
93
|
+
attr_accessor :left
|
94
|
+
# No fewer than this number of letters will show up to the right of the
|
95
|
+
# hyphen. This overrides the default specified in the language.
|
96
|
+
attr_accessor :right
|
97
|
+
# The name of the language to be used in hyphenating words. This will be
|
98
|
+
# a two or three character ISO 639 code, with the two character form
|
99
|
+
# being the canonical resource name. This will load the language
|
100
|
+
# hyphenation definitions from text/hyphen/language/<code> as
|
101
|
+
# a Ruby class. The resource 'text/hyphen/language/en_us' defines the
|
102
|
+
# language class Text::Hyphen::Language::EN_US. It also defines the
|
103
|
+
# secondary forms Text::Hyphen::Language::EN and
|
104
|
+
# Text::Hyphen::Language::ENG_US.
|
105
|
+
#
|
106
|
+
# Minimal transformations will be performed on the language code
|
107
|
+
# provided, such that any dashes are converted to underscores (e.g.,
|
108
|
+
# 'en-us' becomes 'en_us') and all characters are regularised. Resource
|
109
|
+
# names will be downcased and class names will be upcased (e.g., 'Pt'
|
110
|
+
# for the Portuguese language becomes 'pt' and 'PT', respectively).
|
111
|
+
#
|
112
|
+
# The language may also be specified as an instance of
|
113
|
+
# Text::Hyphen::Language.
|
114
|
+
attr_accessor :language
|
115
|
+
def language=(lang) #:nodoc:
  require 'text/hyphen/language' unless defined?(Text::Hyphen::Language)

  unless lang.kind_of?(Text::Hyphen::Language)
    # A language code: keep it in lower case and let load_language resolve it
    # (load_language does nothing while the object is still being initialised).
    @iso_language = lang.downcase
    load_language
    return @iso_language
  end

  # A ready-made language object: the code becomes the lower-cased last
  # "::"-separated segment of its string form, and the object is used as is.
  @iso_language = lang.to_s.split(%r{::}o).last.downcase
  @language = lang
  @iso_language
end
|
126
|
+
# Returns the language's ISO 639 ID, e.g., "en_us" or "pt".
|
127
|
+
attr_reader :iso_language
|
128
|
+
|
129
|
+
# The following initializations are equivalent:
|
130
|
+
#
|
131
|
+
# hyp = Text::Hyphen.new(:language => "EU")
|
132
|
+
# hyp = Text::Hyphen.new { |h| h.language = "EU" }
|
133
|
+
def initialize(options = {}) # :yields self:
  # Result caches and pattern tables all start out empty.
  @cache = {}
  @vcache = {}

  @hyphen = {}
  @begin_hyphen = {}
  @end_hyphen = {}
  @both_hyphen = {}
  @exception = {}

  # Settings from +options+ are applied first; the optional block runs next
  # and may replace any of them.
  @iso_language = options[:language]
  @left = options[:left]
  @right = options[:right]

  # While @first_load is true, load_language returns immediately, so a
  # language chosen inside the block is not loaded until the block is done.
  @first_load = true
  yield self if block_given?
  @first_load = false

  load_language

  # Final fallbacks for margins that are still unset after loading.
  @left ||= DEFAULT_MIN_LEFT
  @right ||= DEFAULT_MIN_RIGHT
end
|
156
|
+
|
157
|
+
# Returns a list of places where the word can be divided, as
|
158
|
+
#
|
159
|
+
# hyp.hyphenate('representation')
|
160
|
+
#
|
161
|
+
# returns [3, 5, 8, 10]. If the word has been hyphenated previously, it
|
162
|
+
# will be returned from a per-instance cache.
|
163
|
+
def hyphenate(word)
  word = word.downcase
  $stderr.puts "Hyphenating #{word}" if DEBUG
  return @cache[word] if @cache.has_key?(word)

  # A word listed in the language's exceptions skips pattern matching.
  res = @language.exceptions[word]
  return @cache[word] = make_result_list(res) if res

  letters = word.split(//).size
  # One slot per position between letters (plus both ends). Each slot keeps
  # the highest level any matching pattern assigned to that position.
  result = [0] * (letters + 1)
  rightstop = letters - @right

  # Merges the digit string stored for +str+ in +hash+ into result, aligned at
  # +pos+, keeping the larger value wherever they overlap.
  updater = Proc.new do |hash, str, pos|
    if hash.has_key?(str)
      $stderr.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
      hash[str].split(//).each_with_index do |cc, ii|
        cc = cc.to_i
        result[ii + pos] = cc if cc > result[ii + pos]
      end
      $stderr.print ": #{result}\n" if DEBUG
    end
  end

  # Walk the word: try every substring starting at each position. Substrings
  # at the very start are also tried against the start patterns, and those
  # reaching the end of the word against the stop patterns.
  (0..rightstop).each do |pos|
    restlength = word.length - pos
    (1..restlength).each do |length|
      substr = word[pos, length]
      updater[@language.hyphen, substr, pos]
      updater[@language.start, substr, pos] if pos.zero?
      updater[@language.stop, substr, pos] if (length == restlength)
    end
  end

  updater[@language.both, word, 0] if @language.both[word]

  # Clear the positions too close to either end: every index up to and
  # including @left, and every index from (letters - @right) onwards. These
  # are the positions the previous range-based loops cleared, but testing each
  # index avoids the IndexError those loops raised for words shorter than the
  # margins (for example left 0, right 2 and a one-letter word).
  #
  # NOTE(review): index @left itself is cleared, so a break leaving exactly
  # @left letters before the hyphen is never returned (likewise for @right).
  # The accessor documentation says "no fewer than"; confirm this is intended.
  result.each_index do |i|
    result[i] = 0 if i <= @left || i >= letters - @right
  end

  @cache[word] = make_result_list(result)
end
|
201
|
+
|
202
|
+
# Returns a visualization of the hyphenation points, so:
|
203
|
+
#
|
204
|
+
# hyp.visualize('representation')
|
205
|
+
#
|
206
|
+
# returns <tt>rep-re-sen-ta-tion</tt>, at least for English patterns. If
|
207
|
+
# the word has been visualised previously, it will be returned from
|
208
|
+
# a per-instance cache.
|
209
|
+
def visualise(word)
  return @vcache[word] if @vcache.has_key?(word)

  hyphenated = word.dup
  # Each point examined shifts the next insertion one place to the right,
  # matching the position of that point within the list.
  shift = 0
  hyphenate(hyphenated).each do |point|
    hyphenated[point.to_i + shift, 0] = '-' if point != 0
    shift += 1
  end

  @vcache[word] = hyphenated
end

alias visualize visualise
|
219
|
+
|
220
|
+
# Empties the hyphenation cache and the visualisation cache of this instance.
# Returns the (now empty) visualisation cache.
def clear_cache!
  [@cache, @vcache].map { |memo| memo.clear }.last
end
|
224
|
+
|
225
|
+
# This function will hyphenate a word so that the first point is at most
|
226
|
+
# +size+ characters.
|
227
|
+
# Splits +word+ at its last hyphenation point that lies below +size+.
#
# Returns a two-element array: the head of the word with a trailing "-" and
# the remainder, or [nil, word] when no point is below +size+.
def hyphenate_to(word, size)
  # select builds a new array. The array returned by hyphenate is the one held
  # in the per-instance cache, so filtering it in place (delete_if) would
  # permanently drop points from later hyphenate/visualise results.
  point = hyphenate(word).select { |e| e < size }.max
  if point.nil?
    [nil, word]
  else
    [word[0 ... point] + "-", word[point .. -1]]
  end
end
|
235
|
+
|
236
|
+
# Returns statistics
|
237
|
+
# Builds a text report for the loaded language: the number of word-start,
# word-stop, start/stop ("both"), normal and exception entries, together with
# their total and the language code. The report is returned, not printed.
def stats
|
238
|
+
_b = @language.both.size
|
239
|
+
_s = @language.start.size
|
240
|
+
_e = @language.stop.size
|
241
|
+
_h = @language.hyphen.size
|
242
|
+
_x = @language.exceptions.size
|
243
|
+
_T = _b + _s + _e + _h + _x
|
244
|
+
|
245
|
+
s = <<-EOS
|
246
|
+
|
247
|
+
The language '%s' contains %d total hyphenation patterns.
|
248
|
+
% 6d patterns are word start patterns.
|
249
|
+
% 6d patterns are word stop patterns.
|
250
|
+
% 6d patterns are word start/stop patterns.
|
251
|
+
% 6d patterns are normal patterns.
|
252
|
+
% 6d patterns are exceptions.
|
253
|
+
|
254
|
+
EOS
|
255
|
+
s % [ @iso_language, _T, _s, _e, _b, _h, _x ]
|
256
|
+
end
|
257
|
+
|
258
|
+
private
|
259
|
+
# Merges the digit string stored for +str+ in +hash+ into @result, aligned at
# +pos+, keeping the larger value wherever they overlap. Does nothing when
# +hash+ has no entry for +str+.
#
# NOTE(review): nothing visible in this file calls this method or assigns
# @result; hyphenate uses a local proc with the same logic. Presumably a
# leftover -- confirm before relying on it.
def updateresult(hash, str, pos) #:nodoc:
  return unless hash.has_key?(str)

  STDERR.print "#{pos}: #{str}: #{hash[str]}" if DEBUG
  hash[str].split('').each_with_index do |digit, offset|
    level = digit.to_i
    slot = offset + pos
    @result[slot] = level if level > @result[slot]
  end
  STDERR.puts ": #{@result}" if DEBUG
end
|
269
|
+
|
270
|
+
# Converts a list of per-position levels into hyphenation points: the indices
# (other than 0) whose level is odd.
#
# +res+ may be an array of integers or digit strings, or a single string of
# digits such as "00010". A String is split into characters first, because
# String has no each_with_index on Ruby 1.9 and later.
def make_result_list(res) #:nodoc:
  levels = res.kind_of?(String) ? res.split(//) : res
  points = []
  levels.each_with_index do |level, index|
    points << index if index != 0 && level.to_i % 2 == 1
  end
  points
end
|
275
|
+
|
276
|
+
# Requires the resource for the current language code and installs its
# language object, falling back to "en_us" when no code has been set.
#
# The code is normalised first, as the documentation of the language accessor
# describes: it is downcased and dashes become underscores, so "EN-us" loads
# text/hyphen/language/en_us and resolves the constant EN_US. Margins that are
# still unset are taken from the language. Returns the normalised code, or nil
# (doing nothing) while the object is still being initialised.
def load_language
  return if @first_load

  @iso_language = (@iso_language || "en_us").to_s.downcase.tr('-', '_')

  require "text/hyphen/language/#{@iso_language}"

  @language = Text::Hyphen::Language.const_get(@iso_language.upcase)
  @left ||= @language.left
  @right ||= @language.right

  @iso_language
end
|
289
|
+
end
|