truty 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e33c09b42c8ceb567b09cf0a0577bb01b20dba5c
4
+ data.tar.gz: 08f1808d7826aeef5738b05af2081799d0606cd5
5
+ SHA512:
6
+ metadata.gz: cb6bd314915bfa2ca1d9d467b39fdc647f0528820d255c73808a1389eddd5687ae3d72deb423bd2c1b2ecc06451d8e483b2067f53fa4db1ec7be3eaf559657d9
7
+ data.tar.gz: f10562c3e8a6215d74cfdf0308a02cfd2b281b724c4eb11116daacf7a0f84a80a7244df7c861b82f58ebadc854267a4791e4f03ed86f23dcbe3e6f4048fad8c3
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Matěj Kašpar Jirásek
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,4 @@
1
+
2
+ # Truty
3
+
4
+ Truty is a Ruby gem, still in development, that converts plain-text strings with the aim of fixing their typographic imperfections.
data/bin/truty ADDED
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby

# Command-line entry point of the gem: reads text from the files named on the
# command line (or from standard input when none are given), applies the Czech
# typography fixes and prints the result to standard output.
#
# The shebang deliberately carries no interpreter options: on Linux `env`
# receives everything after its own name as ONE argument, so
# "ruby -rubygems" is looked up as a program name and fails. The option is
# also unnecessary (RubyGems is loaded by default) and breaks on Ruby >= 2.5,
# where the `ubygems` shim it requires was removed.

require "truty"

# Reads all of ARGF, fixes it with Truty.fix_czech_text and prints it.
def main
  puts Truty.fix_czech_text(ARGF.read)
end

main
data/lib/truty.rb ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'uri'
4
+ require 'text/hyphen'
5
+ require 'truty/general'
6
+ require 'truty/czech'
7
+
8
+ # Top-level namespace of Truty, a simple string converter that aims to fix the typographic imperfections of plain text. The General and Czech modules are mixed in with `extend`, so their methods are callable directly on this module.
9
+ # @author Matěj Kašpar Jirásek
10
+ module Truty
11
+
12
+ extend General
13
+ extend Czech
14
+
15
+ end
16
+
@@ -0,0 +1,67 @@
1
+
2
+ module Truty
3
+
4
+ # Module with specific Czech typography fixes.
5
+ # @author Matěj Kašpar Jirásek
6
+ module Czech
7
+
8
# Improves the typography of a longer Czech plain text. The text is split on
# newlines, every line is treated as one paragraph and passed to
# {#fix_czech_paragraph}, and the results are joined with newlines again.
#
# @param input [String] The text which will be converted.
# @return [String] Text with improved typography.
def fix_czech_text(input)
  # The negative limit keeps trailing empty fields; a plain split("\n") would
  # silently drop blank lines at the end of the text (and the final newline)
  # while keeping blank lines in the middle.
  input.split("\n", -1).map { |paragraph| fix_czech_paragraph(paragraph) }.join("\n")
end
15
+
16
# Improves the Czech typography of a single paragraph by running it through
# the general fixes followed by the Czech-specific ones, in a fixed order.
# If you supply more paragraphs at once you might lose some improvements
# such as the widow fix. For longer text see {#fix_czech_text}.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with improved typography.
def fix_czech_paragraph(input)
  # Each entry is a fixer name followed by the extra arguments it receives
  # after the text itself. The order of the entries is the order of execution.
  pipeline = [
    [:ellipsis],
    [:fix_multicharacters],
    [:fix_punctuation_whitespace],
    [:fix_brackets_whitespace],
    [:add_soft_hyphens, "cs"],
    [:emdash_spaces],
    [:endash_spaces],
    [:fix_double_quotes, "„", "“"],
    [:fix_single_quotes, "‚", "‘"],
    [:fix_multiplication_sign],
    [:fix_space_between_numbers],
    [:fix_units],
    [:fix_trailing_spaces],
    [:fix_widows],
    [:fix_long_czech_numbers],
    [:fix_czech_one_character_words],
    [:fix_czech_abbreviations]
  ]

  pipeline.inject(input) do |text, (fixer, *extra)|
    send(fixer, text, *extra)
  end
end
40
+
41
# Adds a non-breaking space after Czech one-character prepositions and
# conjunctions (a, i, k, o, s, u, v, z - matched case-insensitively), so they
# never end up alone at the end of a line. Consecutive one-character words
# are all glued to the word that follows them.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with non-breaking spaces after prepositions.
def fix_czech_one_character_words(input)
  input.gsub(/(\s+|^)((?:[aikosuvz]\s+)+)/i) do
    leading = Regexp.last_match(1)
    words = Regexp.last_match(2)
    # The non-breaking space is written as an escape because the literal
    # character is indistinguishable from an ordinary space in source code.
    leading + words.gsub(/\s+/, "\u00A0")
  end
end
48
+
49
# Divides long numbers into groups of three digits (counted from the right)
# separated by a thin space.
#
# Only whole digit runs of five or more digits are grouped: four-digit
# numbers are left alone so that years such as 2014 are not broken apart, and
# digits directly following a decimal separator are skipped because grouping
# a fractional part from the right would be wrong.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with spaces inside of long numbers.
def fix_long_czech_numbers(input)
  # (?<!\d)      - start only at the beginning of a digit run
  # (?<!\d[.,])  - ... that is not the fractional part of a decimal number
  input.gsub(/(?<!\d)(?<!\d[.,])\d{5,}/) do |digits|
    # "\u2009" is a thin space; written as an escape because the literal
    # character cannot be told apart from an ordinary space.
    digits.reverse.scan(/\d{1,3}/).join("\u2009").reverse
  end
end
56
+
57
# Czech abbreviations handled by {#fix_czech_abbreviations}. An entry ending
# with a space also binds the abbreviation to the following word; an entry
# without it only protects the spaces inside the abbreviation.
CZECH_ABBREVIATIONS = [
  "a. s.|abl. |absol. |adj. |adm. |adv. |aj.|ak. |ak. sl.|akt. |alch. |amer. |anat. |angl. |anglosas. |ap.|apod.",
  "arab. |arch. |archit. |arg. |arm. gen. |astr. |astrol. |atd.|atp.|att. |b. k.|Bc. |BcA. |belg. |bibl. |biol. ",
  "bl. |boh. |bot. |br. |brig. gen. |brit. |bulh. |bás. |býv. |chcsl. |chem. |chil. |CSc. |csl. |círk. |dat. ",
  "dep. |des. |dial. |DiS.|dl. |doc. |dol. |dop. |dopr. |dosl. |dán. |dór. |děj. |dět. |ekon. |epic. |etnonym. ",
  "eufem. |ev. |event. |f. |fam. |fem. |fil. |film. |fin. |form. |fot. |fr. |fut. |fyz. |gen. |genmjr. |genplk. ",
  "genpor. |geogr. |geol. |geom. |germ. |gram. |hebr. |herald. |hist. |hl. |hod. |hor. |horn. |hovor. |hud. |hut. ",
  "ie. |imp. |impf. |ind. |indoevr. |inf. |Ing. |instr. |interj. |iron. |it. |ión. |j. č.|jap. |JUDr. |k. s.",
  "kanad. |katalán. |klas. |kniž. |komp. |konj. |konkr. |kpt. |kr. |kuch. |kř. |lat. |les. |lid. |lit. |liturg. ",
  "log. |lok. |lék. |m. |mat. |meteor. |metr. |MgA. |Mgr. |mil. |mj. |mjr. |ml. |mld. |mn. č.|mod. |ms. |MUDr. ",
  "MVDr. |mysl. |n. |n. l.|např. |neklas. |nesklon. |než. |niz. |nom. |nor. |npor. |nprap. |nrtm. |nstržm. ",
  "náb. |nám. |námoř. |něm. |o. p. s.|o. s.|ob. |obch. |obyč. |odd. |odp. |ojed. |opt. |p. |p. n. l.|p. o.",
  "P. S. |P. T. |part. |pas. |pejor. |pers. |pf. |PharmDr. |PhDr. |pl. |plk. |plpf. |po Kr.|pol. |pomn. |popř. ",
  "por. |pplk. |ppor. |pprap. |prap. |prep. |prof. |práv. |př. Kr.|př. n. l.|před n. l.|předl. |přivl. |r. ",
  "rak. |rcsl. |refl. |reg. |resp. |rkp. |RNDr. |roč. |RSDr. |rtm. |rtn. |rum. |rus. |s. |s. p.|s. r. o.",
  "samohl. |Sb. |sg. |sl. |slang. |slov. |souhl. |spec. |spol. s r. o.|sport. |srov. |st. |stfr. |stol. |str. ",
  "stržm. |stsl. |střv. |subj. |subst. |superl. |sv. |svob. |sz. |t. r.|tech. |telev. |teol. |ThDr. |tis. |tj. ",
  "trans. |tur. |typogr. |tzn. |tzv. |táz. |v z.|v. o. s.|v. r.|v. v. i.|var. |vedl. |verb. |vl. jm. |voj. |vok. ",
  "vulg. |vztaž. |výtv. |vč. |vůb. |z. s.|zahr. |zast. |zejm. |zeměd. |zkr. |zn. |zvl. |zájm. |zř. |č. |č. j.",
  "č. p. |čas. |čes. |čet. |čj. |čp. |čín. |čís. |ř. |řec. |říj. |škpt. |špan. |šprap. |št. prap. |švýc. "
].flat_map { |row| row.split("|") }.freeze

# Case-insensitive pattern matching any entry of CZECH_ABBREVIATIONS.
# * Every entry is escaped, so "." matches a literal period only.
# * Longer entries come first, so "s. r. o." wins over its prefix "s. ".
# * The look-behind rejects matches that start in the middle of a word.
CZECH_ABBREVIATION_PATTERN = Regexp.new(
  "(?<![[:alnum:]])(?:" +
    CZECH_ABBREVIATIONS.sort_by { |abbr| -abbr.length }.map { |abbr| Regexp.escape(abbr) }.join("|") +
    ")",
  Regexp::IGNORECASE
)

# Adds non-breaking spaces in and after Czech abbreviations.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with non-breaking spaces in and after abbreviations.
def fix_czech_abbreviations(input)
  # Every ordinary space inside the matched abbreviation (including a
  # trailing one, where the list entry has it) becomes U+00A0.
  input.gsub(CZECH_ABBREVIATION_PATTERN) { |abbreviation| abbreviation.tr(" ", "\u00A0") }
end
65
+
66
+ end
67
+ end
@@ -0,0 +1,184 @@
1
+
2
+ module Truty
3
+
4
+ # Module with general typography fixes for all the languages. The fixes in here should not be language specific.
5
+ # @author Matěj Kašpar Jirásek
6
+ module General
7
+
8
# Improves the typography of a longer plain text. The text is split on
# newlines, every line is treated as one paragraph and passed to
# {#fix_paragraph}, and the results are joined with newlines again.
#
# @param input [String] The text which will be converted.
# @param lang [String] Sets the language of hyphenation. (See {#add_soft_hyphens}.)
# @return [String] Text with improved typography.
def fix(input, lang = "en_us")
  # The negative limit keeps trailing empty fields; a plain split("\n") would
  # silently drop blank lines at the end of the text (and the final newline)
  # while keeping blank lines in the middle.
  input.split("\n", -1).map { |paragraph| fix_paragraph(paragraph, lang) }.join("\n")
end
16
+
17
# Improves the typography of a single paragraph by running it through the
# general fixes in a fixed order. If you supply more paragraphs at once you
# might lose some improvements such as the widow fix. For longer text see
# {#fix}.
#
# Quotation marks follow the language: English variants ("en", "en_us",
# "en_uk", ...) get “…” and ‘…’, which are also the defaults of
# {#fix_double_quotes} and {#fix_single_quotes}. Every other language keeps
# the low-high „…“ and ‚…‘ marks this method has always produced.
#
# @param input [String] The paragraph which will be converted.
# @param lang [String] Sets the language of hyphenation. (See {#add_soft_hyphens}.)
# @return [String] Paragraph with improved typography.
def fix_paragraph(input, lang = "en_us")
  english = lang.to_s.downcase.start_with?("en")
  double_quotes = english ? ["“", "”"] : ["„", "“"]
  single_quotes = english ? ["‘", "’"] : ["‚", "‘"]

  output = ellipsis(input)
  output = fix_multicharacters(output)
  output = fix_punctuation_whitespace(output)
  output = fix_brackets_whitespace(output)
  output = add_soft_hyphens(output, lang)
  output = emdash_spaces(output)
  output = endash_spaces(output)
  output = fix_double_quotes(output, *double_quotes)
  output = fix_single_quotes(output, *single_quotes)
  output = fix_multiplication_sign(output)
  output = fix_space_between_numbers(output)
  output = fix_units(output)
  output = fix_trailing_spaces(output)
  fix_widows(output)
end
39
+
40
# Replaces every run of three or more periods (dots, points) with a single
# ellipsis character.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with ellipses.
def ellipsis(input)
  dot_run = /\.{3,}/
  input.gsub(dot_run, "…")
end
47
+
48
# Surrounds an em dash with thin spaces. A run of two or three hyphens is
# converted to an em dash as well. Only dashes that already have whitespace
# on both sides are touched.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with corrected emdashes.
def emdash_spaces(input)
  # "\u2009" is a thin space; written as an escape because the literal
  # character cannot be told apart from an ordinary space in source code.
  input.gsub(/\s+(?:—|-{2,3})\s+/, "\u2009—\u2009")
end
55
+
56
# Puts a non-breaking space before and an ordinary space after an en dash,
# so the dash never starts a line. A single hyphen surrounded by whitespace
# is converted to an en dash as well.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with corrected endashes.
def endash_spaces(input)
  # "\u00A0" is a non-breaking space; written as an escape because the
  # literal character cannot be told apart from an ordinary space.
  input.gsub(/\s+(?:–|-)\s+/, "\u00A0– ")
end
63
+
64
# Adds soft hyphens to the input. The paragraph is split on spaces and every
# word is hyphenated, except words shorter than six characters, the last
# word of the paragraph, and words that look like a URI or an e-mail address.
# The words are joined back with single spaces.
#
# @param input [String] The paragraph which will be converted.
# @param lang [String] Sets the language of hyphenation. One of the languages a {http://www.rubydoc.info/gems/text-hyphen/ text-hyphen gem} can use.
# @param left [Integer] Number of characters at the beginning of a word which cannot be hyphenated.
# @param right [Integer] Number of characters at the end of a word which cannot be hyphenated.
# @param char [String] The character which will be added to hyphenation places (a soft hyphen, U+00AD, by default).
# @return [String] Paragraph with added hyphenation characters.
def add_soft_hyphens(input, lang = "en_us", left = 2, right = 2, char = "\u00AD")
  hyphenator = Text::Hyphen.new(:language => lang, :left => left, :right => right)
  # Both patterns are loop-invariant, so they are built once per call.
  # DEFAULT_PARSER.make_regexp is what the obsolete URI.regexp delegates to.
  uri_pattern = URI::DEFAULT_PARSER.make_regexp
  email_pattern = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+\z/i

  words = input.split(/[ ]+/m)
  last_index = words.size - 1

  hyphenated = words.each_with_index.map do |word, index|
    skip = word.length < 6 || index == last_index || word =~ uri_pattern || word =~ email_pattern
    skip ? word : hyphenator.visualise(word, char)
  end
  hyphenated.join(" ")
end
84
+
85
# Replaces each pair of straight double quotes with typographic quotes.
# Whitespace directly inside the quotes is stripped.
#
# @param input [String] The paragraph which will be converted.
# @param start_quotes [String] The character used for starting quotes.
# @param end_quotes [String] The character used for ending quotes.
# @return [String] Paragraph with correct double quotes.
def fix_double_quotes(input, start_quotes = "“", end_quotes = "”")
  input.gsub(/"[^"]*"/) do |quoted|
    # Drop the surrounding straight quotes, then the padding inside them.
    inner = quoted[1...-1].strip
    start_quotes + inner + end_quotes
  end
end
94
+
95
# Replaces each pair of straight single quotes with typographic quotes.
# Whitespace directly inside the quotes is stripped.
#
# A straight quote that directly follows a letter or digit cannot open a
# quotation, and one that is directly followed by a letter or digit cannot
# close it. This keeps apostrophes ("don't", "it's") from being mistaken for
# quotation marks.
#
# @param input [String] The paragraph which will be converted.
# @param start_quotes [String] The character used for starting quotes.
# @param end_quotes [String] The character used for ending quotes.
# @return [String] Paragraph with correct single quotes.
def fix_single_quotes(input, start_quotes = "‘", end_quotes = "’")
  input.gsub(/(?<![[:alnum:]])'([^']*)'(?![[:alnum:]])/) do
    start_quotes + Regexp.last_match(1).strip + end_quotes
  end
end
104
+
105
# Replaces the letter X used as a multiplication sign with "×".
# Between two numbers ("3x4", "3 x 4") the sign is surrounded by spaces; an
# X directly following a number ("10x faster") is replaced without spaces.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct multiplication signs.
def fix_multiplication_sign(input)
  # The right-hand number is only looked at, not consumed, so it can serve as
  # the left-hand number of the next sign and chains like "2x3x4" are
  # converted consistently.
  between_numbers = input.gsub(/(\d+)\s?[Xx]\s?(?=\d)/, '\1 × ')
  between_numbers.gsub(/(\d+)[Xx]/, '\1×')
end
113
+
114
# Replaces whitespace between two digits with a thin (narrow) non-breaking
# space, so grouped numbers are never split across lines.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces between numbers.
def fix_space_between_numbers(input)
  # Look-arounds leave the digits unconsumed, so every gap in "1 2 3" is
  # fixed, not only every other one. "\u202F" is the narrow no-break space,
  # written as an escape because the literal character is invisible.
  input.gsub(/(?<=\d)\s+(?=\d)/, "\u202F")
end
121
+
122
# Normalizes whitespace around round, square and curly brackets: none
# directly inside the brackets, a single space in place of whitespace before
# an opening bracket and after a closing bracket.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces around brackets.
def fix_brackets_whitespace(input)
  input
    .gsub(/([\(\[\{])\s*/, '\1')        # no whitespace after an opening bracket
    .gsub(/\s*([\]\)\}])/, '\1')        # no whitespace before a closing bracket
    .gsub(/\s+([\(\[\{])\s*/, ' \1')    # one space before an opening bracket
    .gsub(/\s*([\]\)\}])\s+/, '\1 ')    # one space after a closing bracket
end
132
+
133
# Substitutes character sequences that stand for a single typographic
# character: (c) → ©, (p) → ℗, (r) → ®, (sm) → ℠, (tm) → ™, +- → ±, -+ → ∓,
# "No. 5" → №5, °C → ℃ and °F → ℉.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with converted characters.
def fix_multicharacters(input)
  input
    .gsub(/\([Cc]\)/, "©")
    .gsub(/\([Pp]\)/, "℗")
    .gsub(/\([Rr]\)/, "®")
    .gsub(/\((SM|sm|Sm)\)/, "℠")
    .gsub(/\((TM|tm|Tm)\)/, "™")
    .gsub(/\+-/, "±")
    .gsub(/-\+/, "∓")
    # "No" must start a word and may only be followed by a literal period;
    # with an unescaped dot and no boundary, "not 5" or "Reno 5" would be
    # turned into a numero sign as well.
    .gsub(/(?<![[:alnum:]])No\.?\s*(\d+)/i, '№\1')
    .gsub(/°C/, '℃')
    .gsub(/°F/, '℉')
end
149
+
150
# Fixes spaces around punctuation: whitespace before a run of punctuation
# characters (! ? . , ; : …) is removed and exactly one space is put after it
# (including at the end of the paragraph).
#
# URLs, e-mail addresses and numbers with separators (decimals such as 3.14
# or 3,14, times such as 10:30, versions such as 1.2.3) are left untouched,
# because inserting a space after their punctuation would break them.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces around punctuation.
def fix_punctuation_whitespace(input)
  pattern = /
    (\S+:\/\/\S+|\S+@\S+\.\S+|\d+(?:[.,:]\d+)+)   # 1: URL, e-mail or number, kept verbatim
    |
    \s*([!?.,;:…]+)\s*                            # 2: punctuation run with surrounding whitespace
  /x

  input.gsub(pattern) do
    Regexp.last_match(1) || "#{Regexp.last_match(2)} "
  end
end
157
+
158
# Puts a non-breaking space between a number and the unit that follows it
# (percent, per mille, degrees, currency, SI units with optional prefixes)
# and between a leading symbol (*, §, #, †) and the number that follows it.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces between number and unit.
def fix_units(input)
  # "\u00A0" is a non-breaking space; written as an escape because the
  # literal character cannot be told apart from an ordinary space.
  with_units = input.gsub(
    /(\d+)\s+(%|‰|‱|℃|℉|°|€|Kč|(Y|Z|E|P|T|G|M|k|h|da|d|m|µ|n|p|f|a|z|y)?(m(²|³)?|g|s|h|A|K|cd|mol|Ω|℃|℉))/,
    "\\1\u00A0\\2"
  )
  with_units.gsub(/(\*|§|#|†)\s+(\d+)/, "\\1\u00A0\\2")
end
166
+
167
# Replaces the whitespace character before the last word of the paragraph
# with a non-breaking space, so the last word never ends up alone on a line.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with removed widows.
def fix_widows(input)
  # Anchored with \z only: an escaped "\$" matches a literal dollar sign, so
  # it would also bind words that merely precede a "$" in the text.
  # "\u00A0" is a non-breaking space, written as an escape for visibility.
  input.sub(/\s(\S+)\z/, "\u00A0\\1")
end
174
+
175
# Strips the whitespace found at the end of a line or at the end of the
# paragraph.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph without trailing spaces.
def fix_trailing_spaces(input)
  trailing_whitespace = /\s*($|\z)/
  input.gsub(trailing_whitespace) { "" }
end
182
+
183
+ end
184
+ end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: truty
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matěj Kašpar Jirásek
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: text-hyphen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.4'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.9'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.9'
41
+ - !ruby/object:Gem::Dependency
42
+ name: yard
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.8'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.8'
55
+ description: A string converter which aims to correct the typography.
56
+ email: matej.jirasek@me.com
57
+ executables:
58
+ - truty
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - LICENSE
63
+ - README.md
64
+ - bin/truty
65
+ - lib/truty.rb
66
+ - lib/truty/czech.rb
67
+ - lib/truty/general.rb
68
+ homepage: https://github.com/mkj-is/Truty
69
+ licenses:
70
+ - MIT
71
+ metadata: {}
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubyforge_project:
88
+ rubygems_version: 2.4.3
89
+ signing_key:
90
+ specification_version: 4
91
+ summary: True typography converter
92
+ test_files: []
93
+ has_rdoc: