textutils 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +2 -0
- data/lib/textutils.rb +3 -0
- data/lib/textutils/core_ext/file.rb +15 -0
- data/lib/textutils/core_ext/time.rb +19 -0
- data/lib/textutils/helper/title_helper.rb +43 -4
- data/lib/textutils/utils.rb +0 -15
- data/lib/textutils/version.rb +1 -1
- data/test/test_title_helper.rb +31 -1
- metadata +13 -11
data/Manifest.txt
CHANGED
data/lib/textutils.rb
CHANGED
@@ -42,6 +42,9 @@ require 'textutils/helper/address_helper'
|
|
42
42
|
require 'textutils/helper/value_helper'
|
43
43
|
|
44
44
|
require 'textutils/utils'
|
45
|
+
require 'textutils/core_ext/file'
|
46
|
+
require 'textutils/core_ext/time'
|
47
|
+
|
45
48
|
require 'textutils/reader/code_reader'
|
46
49
|
require 'textutils/reader/hash_reader'
|
47
50
|
require 'textutils/reader/hash_reader_v2'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
class File
|
4
|
+
def self.read_utf8( path )
|
5
|
+
text = open( path, 'r:bom|utf-8' ) do |file|
|
6
|
+
file.read
|
7
|
+
end
|
8
|
+
|
9
|
+
# NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
|
10
|
+
text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
|
11
|
+
|
12
|
+
text
|
13
|
+
end
|
14
|
+
end # class File
|
15
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
class Time
|
5
|
+
|
6
|
+
def self.cet( str ) # central european time (cet) + central european summer time (cest)
|
7
|
+
ActiveSupport::TimeZone['Vienna'].parse( str )
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.eet( str ) # eastern european time (eet) + 2 hours
|
11
|
+
ActiveSupport::TimeZone['Bucharest'].parse( str )
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.cst( str ) # central standard time (cst) - 6 hours
|
15
|
+
ActiveSupport::TimeZone['Mexico City'].parse( str )
|
16
|
+
end
|
17
|
+
|
18
|
+
end # class Time
|
19
|
+
|
@@ -45,7 +45,7 @@ module TextUtils
|
|
45
45
|
|
46
46
|
def strip_whitespaces( title )
|
47
47
|
# remove all whitespace and punctuation
|
48
|
-
title.gsub( /[ \t_\-\.!()\[\]'"
|
48
|
+
title.gsub( /[ \t_\-\.!()\[\]'"’\/]/, '' )
|
49
49
|
end
|
50
50
|
|
51
51
|
def strip_special_chars( title )
|
@@ -81,21 +81,28 @@ module TextUtils
|
|
81
81
|
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
82
82
|
|
83
83
|
## todo: add unicode codepoint name
|
84
|
-
|
84
|
+
|
85
|
+
|
85
86
|
alternatives = [
|
86
87
|
['ß', 'ss'],
|
88
|
+
|
87
89
|
['æ', 'ae'],
|
88
90
|
['ä', 'ae'],
|
89
|
-
['ā', 'a' ], # e.g. Liepājas
|
91
|
+
['ā', 'a' ], # e.g. Liepājas, Kāṭhmāḍaũ
|
90
92
|
['á', 'a' ], # e.g. Bogotá, Králové
|
93
|
+
['à', 'a' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
|
91
94
|
['ã', 'a' ], # e.g São Paulo
|
92
95
|
['ă', 'a' ], # e.g. Chișinău
|
93
96
|
['â', 'a' ], # e.g Goiânia
|
94
97
|
['å', 'a' ], # e.g. Vålerenga
|
95
98
|
['ą', 'a' ], # e.g. Śląsk
|
99
|
+
|
96
100
|
['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
|
97
101
|
['ć', 'c' ], # e.g. Budućnost
|
98
102
|
['č', 'c' ], # e.g. Tradiční, Výčepní
|
103
|
+
|
104
|
+
['ḍ', 'd' ], # e.g. Kāṭhmāḍaũ [Kathmandu]
|
105
|
+
|
99
106
|
['é', 'e' ], # e.g. Vélez, Králové
|
100
107
|
['è', 'e' ], # e.g. Rivières
|
101
108
|
['ê', 'e' ], # e.g. Grêmio
|
@@ -103,31 +110,53 @@ module TextUtils
|
|
103
110
|
['ĕ', 'e' ], # e.g. Svĕtlý
|
104
111
|
['ė', 'e' ], # e.g. Vėtra
|
105
112
|
['ë', 'e' ], # e.g. Skënderbeu
|
113
|
+
|
106
114
|
['ğ', 'g' ], # e.g. Qarabağ
|
115
|
+
|
116
|
+
['ḥ', 'h' ], # e.g. Ad-Dawḥah [Doha]
|
117
|
+
|
107
118
|
['ì', 'i' ], # e.g. Potosì
|
108
119
|
['í', 'i' ], # e.g. Ústí
|
109
120
|
['ï', 'i' ], # e.g. El Djazaïr
|
121
|
+
['ī', 'i' ], # e.g. Al-Iskandarīyah [Alexandria]
|
122
|
+
|
110
123
|
['ł', 'l' ], # e.g. Wisła, Wrocław
|
111
124
|
['ñ', 'n' ], # e.g. Porteño
|
112
125
|
['ň', 'n' ], # e.g. Plzeň, Třeboň
|
126
|
+
|
113
127
|
['ö', 'oe'],
|
114
128
|
['ő', 'o' ], # e.g. Győri
|
115
129
|
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
116
130
|
['õ', 'o' ], # e.g. Nõmme
|
117
131
|
['ô', 'o' ], # e.g. Amazônia (pt)
|
118
|
-
['ō', 'o' ], # e.g.
|
132
|
+
['ō', 'o' ], # e.g. Tōkyō, Pishōr
|
133
|
+
['ŏ', 'o' ], # e.g. P'yŏngyang [Pyongyang]
|
119
134
|
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
135
|
+
['ố', 'o' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
|
136
|
+
['ồ', 'o' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
|
137
|
+
['ộ', 'o' ], # e.g. Hà Nội [Hanoi]
|
138
|
+
|
120
139
|
['ř', 'r' ], # e.g. Třeboň
|
140
|
+
|
121
141
|
['ș', 's' ], # e.g. Chișinău, București
|
122
142
|
['ş', 's' ], # e.g. Beşiktaş
|
123
143
|
['š', 's' ], # e.g. Košice
|
144
|
+
['ṣ', 's' ], # e.g. Al-Mawṣil [Mosul]
|
145
|
+
|
124
146
|
['ť', 't' ], # e.g. Měšťan
|
147
|
+
['ṭ', 't' ], # e.g. Al-Kharṭūm [Khartoum], Kāṭhmāḍaũ
|
148
|
+
|
125
149
|
['ü', 'ue'],
|
126
150
|
['ú', 'u' ], # e.g. Fútbol
|
127
151
|
['ù', 'u' ], # e.g. Xyauyù (it)
|
128
152
|
['ū', 'u' ], # e.g. Sūduva
|
129
153
|
['ů', 'u' ], # e.g. Sládkův
|
154
|
+
['ũ', 'u' ], # e.g. Kāṭhmāḍaũ [Kathmandu]
|
155
|
+
|
130
156
|
['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
|
157
|
+
|
158
|
+
['x̌', 'x'], # e.g. Pex̌awar [Peshawar]
|
159
|
+
|
131
160
|
['ý', 'y' ], # e.g. Nefitrovaný
|
132
161
|
['ź', 'z' ], # e.g. Łódź
|
133
162
|
['ž', 'z' ], # e.g. Domžale, Petržalka
|
@@ -135,15 +164,25 @@ module TextUtils
|
|
135
164
|
|
136
165
|
['Á', 'a' ], # e.g. Águila (es)
|
137
166
|
['Č', 'c' ], # e.g. České
|
167
|
+
|
168
|
+
['Ḥ', 'h' ], # e.g. Ḥalab [Aleppo]
|
169
|
+
['Ḫ', 'h' ], # e.g. Ḫamīs Mušayṭ
|
138
170
|
['İ', 'i' ], # e.g. İnter
|
139
171
|
['Í', 'i' ], # e.g. ÍBV
|
140
172
|
['Ł', 'l' ], # e.g. Łódź
|
173
|
+
|
141
174
|
['Ö', 'oe' ], # e.g. Örebro
|
175
|
+
['Ō', 'o' ], # e.g. Ōsaka [Osaka]
|
142
176
|
['Ø', 'o' ], # e.g. Nogne Ø Imperial Stout (no)
|
177
|
+
|
143
178
|
['Ř', 'r' ], # e.g. Řezák
|
179
|
+
|
144
180
|
['Ś', 's' ], # e.g. Śląsk
|
145
181
|
['Š', 's' ], # e.g. MŠK
|
146
182
|
['Ş', 's' ], # e.g. Şüvälan
|
183
|
+
['Ṣ', 's' ], # e.g. Ṣan'ā' [Sana'a]
|
184
|
+
|
185
|
+
['Ṭ', 't' ], # e.g. Ṭarābulus [Tripoli]
|
147
186
|
['Ú', 'u' ], # e.g. Ústí, Újpest
|
148
187
|
['Ž', 'z' ], # e.g. Žilina
|
149
188
|
['Ż', 'z' ] # e.g. Żywiec (polish)
|
data/lib/textutils/utils.rb
CHANGED
@@ -9,21 +9,6 @@ end
|
|
9
9
|
|
10
10
|
|
11
11
|
|
12
|
-
class File
|
13
|
-
def self.read_utf8( path )
|
14
|
-
text = open( path, 'r:bom|utf-8' ) do |file|
|
15
|
-
file.read
|
16
|
-
end
|
17
|
-
|
18
|
-
# NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
|
19
|
-
text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
|
20
|
-
|
21
|
-
text
|
22
|
-
end
|
23
|
-
end # class File
|
24
|
-
|
25
|
-
|
26
|
-
|
27
12
|
def title_esc_regex( title_unescaped )
|
28
13
|
puts "*** warn: depreceated fn call: use TextUtils.title_esc_regex() or include TextUtils::TitleHelpers"
|
29
14
|
TextUtils.title_esc_regex( title_unescaped )
|
data/lib/textutils/version.rb
CHANGED
data/test/test_title_helper.rb
CHANGED
@@ -38,7 +38,37 @@ class TestTitleHelper < MiniTest::Unit::TestCase
|
|
38
38
|
[ '$Alianz$ Arena', 'alianzarena' ],
|
39
39
|
[ 'Arena Amazônia', 'arenaamazonia' ],
|
40
40
|
[ 'Tōkyō [Tokyo]', 'tokyo' ],
|
41
|
-
[ '
|
41
|
+
[ 'Ōsaka [Osaka]', 'osaka' ],
|
42
|
+
[ 'El Djazaïr [Algiers]', 'eldjazair' ],
|
43
|
+
[ 'Al-Kharṭūm [Khartoum]', 'alkhartum' ],
|
44
|
+
[ 'Ṭarābulus [Tripoli]', 'tarabulus' ],
|
45
|
+
[ 'Al-Iskandarīyah [Alexandria]', 'aliskandariyah' ],
|
46
|
+
[ 'Pex̌awar', 'pexawar'],
|
47
|
+
[ 'Pishōr', 'pishor' ],
|
48
|
+
[ 'Pishāwar', 'pishawar' ],
|
49
|
+
[ 'Islām ābād', 'islamabad' ],
|
50
|
+
[ 'Thành Phố Hồ Chí Minh [Saigon]', 'thanhphohochiminh' ],
|
51
|
+
[ 'Hà Nội [Hanoi]', 'hanoi' ],
|
52
|
+
[ 'Donets’k', 'donetsk' ],
|
53
|
+
[ 'Baghdād [Baghdad]', 'baghdad'],
|
54
|
+
[ 'Al-Mawṣil [Mosul]', 'almawsil'],
|
55
|
+
[ 'Al-Baṣrah [Basra]', 'albasrah'],
|
56
|
+
[ 'Arbīl [Erbil]', 'arbil' ],
|
57
|
+
[ 'Kirkūk [Kirkuk]', 'kirkuk' ],
|
58
|
+
[ 'Tehrān [Tehran]', 'tehran' ],
|
59
|
+
[ 'Eṣfahān [Isfahan]', 'esfahan' ],
|
60
|
+
[ 'Shīrāz [Shiraz]', 'shiraz' ],
|
61
|
+
[ 'Tabrīz [Tabriz]', 'tabriz' ],
|
62
|
+
[ 'Ahvāz [Ahvaz]', 'ahvaz' ],
|
63
|
+
[ 'Ad-Dawḥah [Doha]', 'addawhah'],
|
64
|
+
[ 'Ḥalab [Aleppo]', 'halab'],
|
65
|
+
[ 'Al-Madīnah [Medina]', 'almadinah'],
|
66
|
+
[ 'Ad-Dammām [Dammam]', 'addammam' ],
|
67
|
+
[ 'Aṭ-Ṭā’if', 'attaif'],
|
68
|
+
[ 'Ḫamīs Mušayṭ', 'hamismusayt'],
|
69
|
+
[ "Ṣan'ā' [Sana'a]", 'sana'],
|
70
|
+
[ "P'yŏngyang [Pyongyang]", 'pyongyang' ],
|
71
|
+
[ 'Kāṭhmāḍaũ [Kathmandu]', 'kathmadau' ]
|
42
72
|
]
|
43
73
|
|
44
74
|
txt_io.each do |txt|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2014-03-24 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: props
|
16
|
-
requirement: &
|
16
|
+
requirement: &73970010 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *73970010
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: logutils
|
27
|
-
requirement: &
|
27
|
+
requirement: &73969260 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.5'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *73969260
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activesupport
|
38
|
-
requirement: &
|
38
|
+
requirement: &73969070 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *73969070
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rdoc
|
49
|
-
requirement: &
|
49
|
+
requirement: &73968810 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '3.10'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *73968810
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: hoe
|
60
|
-
requirement: &
|
60
|
+
requirement: &73953840 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '3.3'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *73953840
|
69
69
|
description: textutils - Text Filters, Helpers, Readers and More
|
70
70
|
email: ruby-talk@ruby-lang.org
|
71
71
|
executables: []
|
@@ -79,6 +79,8 @@ files:
|
|
79
79
|
- Rakefile
|
80
80
|
- lib/textutils.rb
|
81
81
|
- lib/textutils/classifier.rb
|
82
|
+
- lib/textutils/core_ext/file.rb
|
83
|
+
- lib/textutils/core_ext/time.rb
|
82
84
|
- lib/textutils/filter/code_filter.rb
|
83
85
|
- lib/textutils/filter/comment_filter.rb
|
84
86
|
- lib/textutils/filter/erb_django_filter.rb
|