truty 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e33c09b42c8ceb567b09cf0a0577bb01b20dba5c
4
+ data.tar.gz: 08f1808d7826aeef5738b05af2081799d0606cd5
5
+ SHA512:
6
+ metadata.gz: cb6bd314915bfa2ca1d9d467b39fdc647f0528820d255c73808a1389eddd5687ae3d72deb423bd2c1b2ecc06451d8e483b2067f53fa4db1ec7be3eaf559657d9
7
+ data.tar.gz: f10562c3e8a6215d74cfdf0308a02cfd2b281b724c4eb11116daacf7a0f84a80a7244df7c861b82f58ebadc854267a4791e4f03ed86f23dcbe3e6f4048fad8c3
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Matěj Kašpar Jirásek
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,4 @@
1
+
2
+ # Truty
3
+
4
+ Truty is a Ruby gem, currently in development. It is a simple string converter that aims to fix the typographic imperfections of plain text.
data/bin/truty ADDED
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby

# Command-line entry point of the gem.
#
# The interpreter line deliberately carries no extra arguments: on Linux,
# `env` receives everything after its own path as a single word, so
# "ruby -rubygems" is looked up as one program name and fails. The
# `-rubygems` switch is also unnecessary, because RubyGems is loaded by
# default.

require "truty"

# Reads the whole input through ARGF and prints it to standard output with
# the Czech typography fixes applied.
def main
  puts Truty.fix_czech_text(ARGF.read)
end

main
data/lib/truty.rb ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'uri'
4
+ require 'text/hyphen'
5
+ require 'truty/general'
6
+ require 'truty/czech'
7
+
8
+ # Truty is a simple string converter that aims to fix the typographic imperfections of plain text.
9
+ # @author Matěj Kašpar Jirásek
10
+ module Truty
11
+
12
+ extend General # makes the methods of General callable directly on Truty
13
+ extend Czech # makes the methods of Czech callable directly on Truty (e.g. Truty.fix_czech_text)
14
+
15
+ end
16
+
@@ -0,0 +1,67 @@
1
+
2
module Truty

  # Module with specific Czech typography fixes.
  #
  # Invisible characters are referenced through named constants written with
  # explicit escapes, so they stay readable and survive copy and paste.
  # @author Matěj Kašpar Jirásek
  module Czech

    # No-break space (U+00A0).
    NBSP = "\u00A0".freeze

    # Thin space (U+2009).
    THIN_SPACE = "\u2009".freeze

    # Czech abbreviations. Every ordinary space inside a matched abbreviation
    # (including a trailing one, where listed) becomes a non-breaking space.
    # The order is significant: alternatives are tried in this order, so a
    # shorter entry listed first (e.g. "s. ") wins over a longer one that
    # starts the same way (e.g. "s. r. o.").
    ABBREVIATIONS = [
      "a. s.", "abl. ", "absol. ", "adj. ", "adm. ", "adv. ", "aj.", "ak. ", "ak. sl.", "akt. ",
      "alch. ", "amer. ", "anat. ", "angl. ", "anglosas. ", "ap.", "apod.", "arab. ", "arch. ",
      "archit. ", "arg. ", "arm. gen. ", "astr. ", "astrol. ", "atd.", "atp.", "att. ", "b. k.",
      "Bc. ", "BcA. ", "belg. ", "bibl. ", "biol. ", "bl. ", "boh. ", "bot. ", "br. ",
      "brig. gen. ", "brit. ", "bulh. ", "bás. ", "býv. ", "chcsl. ", "chem. ", "chil. ", "CSc. ",
      "csl. ", "círk. ", "dat. ", "dep. ", "des. ", "dial. ", "DiS.", "dl. ", "doc. ", "dol. ",
      "dop. ", "dopr. ", "dosl. ", "dán. ", "dór. ", "děj. ", "dět. ", "ekon. ", "epic. ",
      "etnonym. ", "eufem. ", "ev. ", "event. ", "f. ", "fam. ", "fem. ", "fil. ", "film. ",
      "fin. ", "form. ", "fot. ", "fr. ", "fut. ", "fyz. ", "gen. ", "genmjr. ", "genplk. ",
      "genpor. ", "geogr. ", "geol. ", "geom. ", "germ. ", "gram. ", "hebr. ", "herald. ",
      "hist. ", "hl. ", "hod. ", "hor. ", "horn. ", "hovor. ", "hud. ", "hut. ", "ie. ", "imp. ",
      "impf. ", "ind. ", "indoevr. ", "inf. ", "Ing. ", "instr. ", "interj. ", "iron. ", "it. ",
      "ión. ", "j. č.", "jap. ", "JUDr. ", "k. s.", "kanad. ", "katalán. ", "klas. ", "kniž. ",
      "komp. ", "konj. ", "konkr. ", "kpt. ", "kr. ", "kuch. ", "kř. ", "lat. ", "les. ", "lid. ",
      "lit. ", "liturg. ", "log. ", "lok. ", "lék. ", "m. ", "mat. ", "meteor. ", "metr. ",
      "MgA. ", "Mgr. ", "mil. ", "mj. ", "mjr. ", "ml. ", "mld. ", "mn. č.", "mod. ", "ms. ",
      "MUDr. ", "MVDr. ", "mysl. ", "n. ", "n. l.", "např. ", "neklas. ", "nesklon. ", "než. ",
      "niz. ", "nom. ", "nor. ", "npor. ", "nprap. ", "nrtm. ", "nstržm. ", "náb. ", "nám. ",
      "námoř. ", "něm. ", "o. p. s.", "o. s.", "ob. ", "obch. ", "obyč. ", "odd. ", "odp. ",
      "ojed. ", "opt. ", "p. ", "p. n. l.", "p. o.", "P. S. ", "P. T. ", "part. ", "pas. ",
      "pejor. ", "pers. ", "pf. ", "PharmDr. ", "PhDr. ", "pl. ", "plk. ", "plpf. ", "po Kr.",
      "pol. ", "pomn. ", "popř. ", "por. ", "pplk. ", "ppor. ", "pprap. ", "prap. ", "prep. ",
      "prof. ", "práv. ", "př. Kr.", "př. n. l.", "před n. l.", "předl. ", "přivl. ", "r. ",
      "rak. ", "rcsl. ", "refl. ", "reg. ", "resp. ", "rkp. ", "RNDr. ", "roč. ", "RSDr. ",
      "rtm. ", "rtn. ", "rum. ", "rus. ", "s. ", "s. p.", "s. r. o.", "samohl. ", "Sb. ", "sg. ",
      "sl. ", "slang. ", "slov. ", "souhl. ", "spec. ", "spol. s r. o.", "sport. ", "srov. ",
      "st. ", "stfr. ", "stol. ", "str. ", "stržm. ", "stsl. ", "střv. ", "subj. ", "subst. ",
      "superl. ", "sv. ", "svob. ", "sz. ", "t. r.", "tech. ", "telev. ", "teol. ", "ThDr. ",
      "tis. ", "tj. ", "trans. ", "tur. ", "typogr. ", "tzn. ", "tzv. ", "táz. ", "v z.",
      "v. o. s.", "v. r.", "v. v. i.", "var. ", "vedl. ", "verb. ", "vl. jm. ", "voj. ", "vok. ",
      "vulg. ", "vztaž. ", "výtv. ", "vč. ", "vůb. ", "z. s.", "zahr. ", "zast. ", "zejm. ",
      "zeměd. ", "zkr. ", "zn. ", "zvl. ", "zájm. ", "zř. ", "č. ", "č. j.", "č. p. ", "čas. ",
      "čes. ", "čet. ", "čj. ", "čp. ", "čín. ", "čís. ", "ř. ", "řec. ", "říj. ", "škpt. ",
      "špan. ", "šprap. ", "št. prap. ", "švýc. "
    ].freeze

    # Case-insensitive pattern matching any entry of ABBREVIATIONS literally.
    # Every entry is escaped, so "." only matches a real period, and the
    # lookbehind rejects matches that start in the middle of a word. Without
    # these two guards "n. " would match the "na " in "dům na kopci".
    ABBREVIATIONS_PATTERN = Regexp.new(
      "(?<![[:alnum:]])(?:" + ABBREVIATIONS.map { |abbr| Regexp.escape(abbr) }.join("|") + ")",
      Regexp::IGNORECASE
    )

    # Improves the typography of a large plain text with paragraphs. Adds non-breaking spaces, hyphenation, fixes dashes, etc. Also applies fixes specific to the Czech language, like one character prepositions, abbreviations and spaces between numbers.
    #
    # Paragraphs are separated by "\n". Empty paragraphs, including trailing
    # ones, are kept, so blank lines at the end of the text are not lost.
    #
    # @param input [String] The text which will be converted.
    # @return [String] Text with improved typography.
    def fix_czech_text(input)
      # A negative limit keeps trailing empty fields; the default would drop them.
      input.split("\n", -1).map { |paragraph| fix_czech_paragraph(paragraph) }.join("\n")
    end

    # Improves the Czech typography of a single paragraph. If you supply more paragraphs you might lose some improvements like widows. For improving longer text see {#fix_czech_text}.
    #
    # The general-purpose steps called here (ellipsis, fix_units, ...) are not
    # defined in this module; they must be available on the receiver.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with improved typography.
    def fix_czech_paragraph(input)
      output = ellipsis(input)
      output = fix_multicharacters(output)
      output = fix_punctuation_whitespace(output)
      output = fix_brackets_whitespace(output)
      output = add_soft_hyphens(output, "cs")
      output = emdash_spaces(output)
      output = endash_spaces(output)
      output = fix_double_quotes(output, "„", "“")
      output = fix_single_quotes(output, "‚", "‘")
      output = fix_multiplication_sign(output)
      output = fix_space_between_numbers(output)
      output = fix_units(output)
      output = fix_trailing_spaces(output)
      output = fix_widows(output)
      output = fix_long_czech_numbers(output)
      output = fix_czech_one_character_words(output)
      fix_czech_abbreviations(output)
    end

    # Adds a non-breaking space after Czech one character prepositions and conjunctions.
    #
    # A run of such words ("a v lese") is handled as a whole, so every
    # whitespace run inside the sequence becomes one non-breaking space.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with non-breaking spaces after prepositions.
    def fix_czech_one_character_words(input)
      input.gsub(/(\s+|^)((?:[aikosuvz]\s+)+)/i) do
        match = Regexp.last_match
        # Group 1 (the whitespace before the first word) is kept as it is.
        match[1] + match[2].gsub(/\s+/, NBSP)
      end
    end

    # Divides long numbers into parts of three digits using thin space.
    #
    # Every run of digits is grouped from the right, so a four digit run such
    # as a year is split as well ("2014" becomes "2", thin space, "014").
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with spaces inside of long numbers.
    def fix_long_czech_numbers(input)
      input.gsub(/\d+/) do |digits|
        # Reverse so that the groups of three are counted from the right.
        digits.reverse.scan(/\d{1,3}/).join(THIN_SPACE).reverse
      end
    end

    # Adds non-breaking spaces in and after Czech abbreviations.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with non-breaking spaces in and after abbreviations.
    def fix_czech_abbreviations(input)
      input.gsub(ABBREVIATIONS_PATTERN) { |abbr| abbr.gsub(" ", NBSP) }
    end

  end
end
@@ -0,0 +1,184 @@
1
+
2
module Truty

  # Module with general typography fixes for all the languages. The fixes in here should not be language specific.
  #
  # Invisible characters are referenced through named constants written with
  # explicit escapes, so they stay readable and survive copy and paste.
  # @author Matěj Kašpar Jirásek
  module General

    # No-break space (U+00A0).
    NBSP = "\u00A0".freeze

    # Thin space (U+2009).
    THIN_SPACE = "\u2009".freeze

    # Narrow no-break space (U+202F), used as the thin non-breaking space.
    NARROW_NBSP = "\u202F".freeze

    # Soft hyphen (U+00AD), the default hyphenation mark.
    SOFT_HYPHEN = "\u00AD".freeze

    # Matches a word that is an e-mail address as a whole; such words are not hyphenated.
    EMAIL_PATTERN = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+\z/i

    # Improves the typography of a large plain text with paragraphs. Adds non-breaking spaces, hyphenation, fixes dashes, etc.
    #
    # Paragraphs are separated by "\n". Empty paragraphs, including trailing
    # ones, are kept, so blank lines at the end of the text are not lost.
    #
    # @param input [String] The text which will be converted.
    # @param lang [String] Sets the language of hyphenation. (See {#add_soft_hyphens}.)
    # @return [String] Text with improved typography.
    def fix(input, lang = "en_us")
      # A negative limit keeps trailing empty fields; the default would drop them.
      input.split("\n", -1).map { |paragraph| fix_paragraph(paragraph, lang) }.join("\n")
    end

    # Improves the typography of a single paragraph. If you supply more paragraphs you might lose some improvements like widows. For improving longer text see {#fix}.
    #
    # @param input [String] The paragraph which will be converted.
    # @param lang [String] Sets the language of hyphenation. (See {#add_soft_hyphens}.)
    # @return [String] Paragraph with improved typography.
    def fix_paragraph(input, lang = "en_us")
      output = ellipsis(input)
      output = fix_multicharacters(output)
      output = fix_punctuation_whitespace(output)
      output = fix_brackets_whitespace(output)
      output = add_soft_hyphens(output, lang)
      output = emdash_spaces(output)
      output = endash_spaces(output)
      # NOTE(review): the low-opening quote pairs below are passed for every
      # language, although the quote methods default to “ ” and ‘ ’. Kept as
      # is to preserve current output; confirm whether the defaults were meant.
      output = fix_double_quotes(output, "„", "“")
      output = fix_single_quotes(output, "‚", "‘")
      output = fix_multiplication_sign(output)
      output = fix_space_between_numbers(output)
      output = fix_units(output)
      output = fix_trailing_spaces(output)
      fix_widows(output)
    end

    # Converts three or more periods (dots, points) into ellipsis.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with ellipses.
    def ellipsis(input)
      input.gsub(/\.{3,}/, "…")
    end

    # Adds thin spaces to emdash from both sides. Also converts two or three hyphens to emdash.
    #
    # Only dashes that already have whitespace on both sides are affected.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with corrected emdashes.
    def emdash_spaces(input)
      input.gsub(/\s+(—|-{2,3})\s+/, "#{THIN_SPACE}—#{THIN_SPACE}")
    end

    # Adds non-breaking space before endash. Also converts a single hyphen surrounded by whitespace to endash.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with corrected endashes.
    def endash_spaces(input)
      input.gsub(/\s+(–|-)\s+/, "#{NBSP}– ")
    end

    # Adds soft hyphens to the input.
    #
    # Words shorter than six characters, the last word, and words that look
    # like a URI or an e-mail address are left untouched. Words are split on
    # runs of spaces and joined again with single spaces.
    #
    # @param input [String] The paragraph which will be converted.
    # @param lang [String] Sets the language of hyphenation. One of the languages a {http://www.rubydoc.info/gems/text-hyphen/ text-hyphen gem} can use.
    # @param left [Integer] Number of characters at the beginning of the words which cannot be hyphenated.
    # @param right [Integer] Number of characters at the end of the words which cannot be hyphenated.
    # @param char [String] The character which will be added to hyphenation places.
    # @return [String] Paragraph with added hyphenation characters.
    def add_soft_hyphens(input, lang = "en_us", left = 2, right = 2, char = SOFT_HYPHEN)
      hyphenator = Text::Hyphen.new(:language => lang, :left => left, :right => right)
      words = input.split(/ +/)
      last_index = words.size - 1
      hyphenated = words.each_with_index.map do |word, index|
        keep = word.length < 6 || index == last_index || word =~ URI.regexp || word =~ EMAIL_PATTERN
        keep ? word : hyphenator.visualise(word, char)
      end
      hyphenated.join(" ")
    end

    # Converts simple double quotes to the typographic ones.
    #
    # @param input [String] The paragraph which will be converted.
    # @param start_quotes [String] The character used for starting quotes.
    # @param end_quotes [String] The character used for ending quotes.
    # @return [String] Paragraph with correct double quotes.
    def fix_double_quotes(input, start_quotes = "“", end_quotes = "”")
      input.gsub(/"[^"]*"/) { |quoted| start_quotes + quoted[1..-2].strip + end_quotes }
    end

    # Converts simple single quotes to the typographic ones.
    #
    # Every pair of ' characters is treated as quotes, so two apostrophes in
    # one paragraph ("don't ... can't") are paired as well.
    #
    # @param input [String] The paragraph which will be converted.
    # @param start_quotes [String] The character used for starting quotes.
    # @param end_quotes [String] The character used for ending quotes.
    # @return [String] Paragraph with correct single quotes.
    def fix_single_quotes(input, start_quotes = "‘", end_quotes = "’")
      input.gsub(/'[^']*'/) { |quoted| start_quotes + quoted[1..-2].strip + end_quotes }
    end

    # Adds multiplication sign between numbers instead of X.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with correct multiplication signs.
    def fix_multiplication_sign(input)
      # The second operand is only looked at, not consumed, so it can serve as
      # the first operand of the next sign in chains such as "2x3x4".
      output = input.gsub(/(\d+)\s?[Xx]\s?(?=\d)/, '\1 × ')
      # An x directly after a number without a second operand ("4x zoom").
      output.gsub(/(\d+)[Xx]/, '\1×')
    end

    # Adds thin non-breaking space between numbers.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with correct spaces between numbers.
    def fix_space_between_numbers(input)
      # Lookarounds leave the digits unconsumed, so every gap in "1 2 3" is replaced.
      input.gsub(/(?<=\d)\s+(?=\d)/, NARROW_NBSP)
    end

    # Fixes spaces around various brackets.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with correct spaces around brackets.
    def fix_brackets_whitespace(input)
      # No whitespace directly inside the brackets ...
      output = input.gsub(/([\(\[\{])\s*/, '\1')
      output = output.gsub(/\s*([\]\)\}])/, '\1')
      # ... and a single space where whitespace surrounds them on the outside.
      output = output.gsub(/\s+([\(\[\{])\s*/, ' \1')
      output.gsub(/\s*([\]\)\}])\s+/, '\1 ')
    end

    # Tries to substitute more characters which should be one, like "©", "™", etc.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with converted characters.
    def fix_multicharacters(input)
      output = input.gsub(/\([Cc]\)/, "©")
      output = output.gsub(/\([Pp]\)/, "℗")
      output = output.gsub(/\([Rr]\)/, "®")
      output = output.gsub(/\((SM|sm|Sm)\)/, "℠")
      output = output.gsub(/\((TM|tm|Tm)\)/, "™")
      output = output.gsub(/\+-/, "±")
      output = output.gsub(/-\+/, "∓")
      # "No" has to start a word and may be followed by a literal period only,
      # so that "Nov 5" or "casino 5" are not turned into a numero sign.
      output = output.gsub(/\bNo\.?\s*(\d+)/i, '№\1')
      output = output.gsub(/°C/, '℃')
      output.gsub(/°F/, '℉')
    end

    # Fixes spaces around punctuation.
    #
    # Whitespace before a run of punctuation is removed and exactly one space
    # is put after it. This also applies inside numbers and times ("3.14",
    # "12:30"), which get a space after the punctuation.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with correct spaces around punctuation.
    def fix_punctuation_whitespace(input)
      input.gsub(/\s*([!?.,;:…]+)\s*/, '\1 ')
    end

    # Fixes non-breaking spaces between number and unit.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with correct spaces between number and unit.
    def fix_units(input)
      # An SI symbol must not be followed by another letter; otherwise the
      # first letters of ordinary words ("2 hours", "10 men") would count as units.
      units = /(\d+)\s+(%|‰|‱|℃|℉|°|€|Kč|(?:Y|Z|E|P|T|G|M|k|h|da|d|m|µ|n|p|f|a|z|y)?(?:m[²³]?|g|s|h|A|K|cd|mol|Ω|℃|℉)(?![[:alpha:]]))/
      output = input.gsub(units, "\\1#{NBSP}\\2")
      # Signs standing in front of a number.
      output.gsub(/(\*|§|#|†)\s+(\d+)/, "\\1#{NBSP}\\2")
    end

    # Adds non-breaking space before the last word in the paragraph.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph with removed widows.
    def fix_widows(input)
      # Anchored to the very end of the string only, so a word ending with a
      # literal "$" in the middle of the paragraph is not affected.
      input.sub(/\s(\S+)\z/, "#{NBSP}\\1")
    end

    # Removes whitespace after the end of the paragraph.
    #
    # @param input [String] The paragraph which will be converted.
    # @return [String] Paragraph without trailing spaces.
    def fix_trailing_spaces(input)
      input.gsub(/\s*$/, '')
    end

  end
end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: truty
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matěj Kašpar Jirásek
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: text-hyphen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.4'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.9'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.9'
41
+ - !ruby/object:Gem::Dependency
42
+ name: yard
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.8'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.8'
55
+ description: A string converter which aims to correct the typography.
56
+ email: matej.jirasek@me.com
57
+ executables:
58
+ - truty
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - LICENSE
63
+ - README.md
64
+ - bin/truty
65
+ - lib/truty.rb
66
+ - lib/truty/czech.rb
67
+ - lib/truty/general.rb
68
+ homepage: https://github.com/mkj-is/Truty
69
+ licenses:
70
+ - MIT
71
+ metadata: {}
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubyforge_project:
88
+ rubygems_version: 2.4.3
89
+ signing_key:
90
+ specification_version: 4
91
+ summary: True typography converter
92
+ test_files: []
93
+ has_rdoc: