textutils 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt CHANGED
@@ -4,6 +4,8 @@ README.md
4
4
  Rakefile
5
5
  lib/textutils.rb
6
6
  lib/textutils/classifier.rb
7
+ lib/textutils/core_ext/file.rb
8
+ lib/textutils/core_ext/time.rb
7
9
  lib/textutils/filter/code_filter.rb
8
10
  lib/textutils/filter/comment_filter.rb
9
11
  lib/textutils/filter/erb_django_filter.rb
data/lib/textutils.rb CHANGED
@@ -42,6 +42,9 @@ require 'textutils/helper/address_helper'
42
42
  require 'textutils/helper/value_helper'
43
43
 
44
44
  require 'textutils/utils'
45
+ require 'textutils/core_ext/file'
46
+ require 'textutils/core_ext/time'
47
+
45
48
  require 'textutils/reader/code_reader'
46
49
  require 'textutils/reader/hash_reader'
47
50
  require 'textutils/reader/hash_reader_v2'
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ class File
4
+ def self.read_utf8( path )
5
+ text = open( path, 'r:bom|utf-8' ) do |file|
6
+ file.read
7
+ end
8
+
9
+ # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
10
+ text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
11
+
12
+ text
13
+ end
14
+ end # class File
15
+
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ class Time
5
+
6
+ def self.cet( str ) # central european time (cet) + central european summer time (cest)
7
+ ActiveSupport::TimeZone['Vienna'].parse( str )
8
+ end
9
+
10
+ def self.eet( str ) # eastern european time (eet) + 2 hours
11
+ ActiveSupport::TimeZone['Bucharest'].parse( str )
12
+ end
13
+
14
+ def self.cst( str ) # central standard time (cst) - 6 hours
15
+ ActiveSupport::TimeZone['Mexico City'].parse( str )
16
+ end
17
+
18
+ end # class Time
19
+
@@ -45,7 +45,7 @@ module TextUtils
45
45
 
46
46
  def strip_whitespaces( title )
47
47
  # remove all whitespace and punctuation
48
- title.gsub( /[ \t_\-\.!()\[\]'"\/]/, '' )
48
+ title.gsub( /[ \t_\-\.!()\[\]'"’\/]/, '' )
49
49
  end
50
50
 
51
51
  def strip_special_chars( title )
@@ -81,21 +81,28 @@ module TextUtils
81
81
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
82
82
 
83
83
  ## todo: add unicode codepoint name
84
-
84
+
85
+
85
86
  alternatives = [
86
87
  ['ß', 'ss'],
88
+
87
89
  ['æ', 'ae'],
88
90
  ['ä', 'ae'],
89
- ['ā', 'a' ], # e.g. Liepājas
91
+ ['ā', 'a' ], # e.g. Liepājas, Kāṭhmāḍaũ
90
92
  ['á', 'a' ], # e.g. Bogotá, Králové
93
+ ['à', 'a' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
91
94
  ['ã', 'a' ], # e.g São Paulo
92
95
  ['ă', 'a' ], # e.g. Chișinău
93
96
  ['â', 'a' ], # e.g Goiânia
94
97
  ['å', 'a' ], # e.g. Vålerenga
95
98
  ['ą', 'a' ], # e.g. Śląsk
99
+
96
100
  ['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
97
101
  ['ć', 'c' ], # e.g. Budućnost
98
102
  ['č', 'c' ], # e.g. Tradiční, Výčepní
103
+
104
+ ['ḍ', 'd' ], # e.g. Kāṭhmāḍaũ [Kathmandu]
105
+
99
106
  ['é', 'e' ], # e.g. Vélez, Králové
100
107
  ['è', 'e' ], # e.g. Rivières
101
108
  ['ê', 'e' ], # e.g. Grêmio
@@ -103,31 +110,53 @@ module TextUtils
103
110
  ['ĕ', 'e' ], # e.g. Svĕtlý
104
111
  ['ė', 'e' ], # e.g. Vėtra
105
112
  ['ë', 'e' ], # e.g. Skënderbeu
113
+
106
114
  ['ğ', 'g' ], # e.g. Qarabağ
115
+
116
+ ['ḥ', 'h' ], # e.g. Ad-Dawḥah [Doha]
117
+
107
118
  ['ì', 'i' ], # e.g. Potosì
108
119
  ['í', 'i' ], # e.g. Ústí
109
120
  ['ï', 'i' ], # e.g. El Djazaïr
121
+ ['ī', 'i' ], # e.g. Al-Iskandarīyah [Alexandria]
122
+
110
123
  ['ł', 'l' ], # e.g. Wisła, Wrocław
111
124
  ['ñ', 'n' ], # e.g. Porteño
112
125
  ['ň', 'n' ], # e.g. Plzeň, Třeboň
126
+
113
127
  ['ö', 'oe'],
114
128
  ['ő', 'o' ], # e.g. Győri
115
129
  ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
116
130
  ['õ', 'o' ], # e.g. Nõmme
117
131
  ['ô', 'o' ], # e.g. Amazônia (pt)
118
- ['ō', 'o' ], # e.g. Tōkyō
132
+ ['ō', 'o' ], # e.g. Tōkyō, Pishōr
133
+ ['ŏ', 'o' ], # e.g. P'yŏngyang [Pyongyang]
119
134
  ['ø', 'o' ], # e.g. Fuglafjørdur, København
135
+ ['ố', 'o' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
136
+ ['ồ', 'o' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
137
+ ['ộ', 'o' ], # e.g. Hà Nội [Hanoi]
138
+
120
139
  ['ř', 'r' ], # e.g. Třeboň
140
+
121
141
  ['ș', 's' ], # e.g. Chișinău, București
122
142
  ['ş', 's' ], # e.g. Beşiktaş
123
143
  ['š', 's' ], # e.g. Košice
144
+ ['ṣ', 's' ], # e.g. Al-Mawṣil [Mosul]
145
+
124
146
  ['ť', 't' ], # e.g. Měšťan
147
+ ['ṭ', 't' ], # e.g. Al-Kharṭūm [Khartoum], Kāṭhmāḍaũ
148
+
125
149
  ['ü', 'ue'],
126
150
  ['ú', 'u' ], # e.g. Fútbol
127
151
  ['ù', 'u' ], # e.g. Xyauyù (it)
128
152
  ['ū', 'u' ], # e.g. Sūduva
129
153
  ['ů', 'u' ], # e.g. Sládkův
154
+ ['ũ', 'u' ], # e.g. Kāṭhmāḍaũ [Kathmandu]
155
+
130
156
  ['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
157
+
158
+ ['x̌', 'x'], # e.g. Pex̌awar [Peshawar]
159
+
131
160
  ['ý', 'y' ], # e.g. Nefitrovaný
132
161
  ['ź', 'z' ], # e.g. Łódź
133
162
  ['ž', 'z' ], # e.g. Domžale, Petržalka
@@ -135,15 +164,25 @@ module TextUtils
135
164
 
136
165
  ['Á', 'a' ], # e.g. Águila (es)
137
166
  ['Č', 'c' ], # e.g. České
167
+
168
+ ['Ḥ', 'h' ], # e.g. Ḥalab [Aleppo]
169
+ ['Ḫ', 'h' ], # e.g. Ḫamīs Mušayṭ
138
170
  ['İ', 'i' ], # e.g. İnter
139
171
  ['Í', 'i' ], # e.g. ÍBV
140
172
  ['Ł', 'l' ], # e.g. Łódź
173
+
141
174
  ['Ö', 'oe' ], # e.g. Örebro
175
+ ['Ō', 'o' ], # e.g. Ōsaka [Osaka]
142
176
  ['Ø', 'o' ], # e.g. Nogne Ø Imperial Stout (no)
177
+
143
178
  ['Ř', 'r' ], # e.g. Řezák
179
+
144
180
  ['Ś', 's' ], # e.g. Śląsk
145
181
  ['Š', 's' ], # e.g. MŠK
146
182
  ['Ş', 's' ], # e.g. Şüvälan
183
+ ['Ṣ', 's' ], # e.g. Ṣan'ā' [Sana'a]
184
+
185
+ ['Ṭ', 't' ], # e.g. Ṭarābulus [Tripoli]
147
186
  ['Ú', 'u' ], # e.g. Ústí, Újpest
148
187
  ['Ž', 'z' ], # e.g. Žilina
149
188
  ['Ż', 'z' ] # e.g. Żywiec (polish)
@@ -9,21 +9,6 @@ end
9
9
 
10
10
 
11
11
 
12
- class File
13
- def self.read_utf8( path )
14
- text = open( path, 'r:bom|utf-8' ) do |file|
15
- file.read
16
- end
17
-
18
- # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
19
- text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
20
-
21
- text
22
- end
23
- end # class File
24
-
25
-
26
-
27
12
  def title_esc_regex( title_unescaped )
28
13
  puts "*** warn: depreceated fn call: use TextUtils.title_esc_regex() or include TextUtils::TitleHelpers"
29
14
  TextUtils.title_esc_regex( title_unescaped )
@@ -1,6 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.9.0'
4
+ VERSION = '0.9.1'
5
5
 
6
6
  end # module TextUtils
@@ -38,7 +38,37 @@ class TestTitleHelper < MiniTest::Unit::TestCase
38
38
  [ '$Alianz$ Arena', 'alianzarena' ],
39
39
  [ 'Arena Amazônia', 'arenaamazonia' ],
40
40
  [ 'Tōkyō [Tokyo]', 'tokyo' ],
41
- [ 'El Djazaïr [Algiers]', 'eldjazair' ]
41
+ [ 'Ōsaka [Osaka]', 'osaka' ],
42
+ [ 'El Djazaïr [Algiers]', 'eldjazair' ],
43
+ [ 'Al-Kharṭūm [Khartoum]', 'alkhartum' ],
44
+ [ 'Ṭarābulus [Tripoli]', 'tarabulus' ],
45
+ [ 'Al-Iskandarīyah [Alexandria]', 'aliskandariyah' ],
46
+ [ 'Pex̌awar', 'pexawar'],
47
+ [ 'Pishōr', 'pishor' ],
48
+ [ 'Pishāwar', 'pishawar' ],
49
+ [ 'Islām ābād', 'islamabad' ],
50
+ [ 'Thành Phố Hồ Chí Minh [Saigon]', 'thanhphohochiminh' ],
51
+ [ 'Hà Nội [Hanoi]', 'hanoi' ],
52
+ [ 'Donets’k', 'donetsk' ],
53
+ [ 'Baghdād [Baghdad]', 'baghdad'],
54
+ [ 'Al-Mawṣil [Mosul]', 'almawsil'],
55
+ [ 'Al-Baṣrah [Basra]', 'albasrah'],
56
+ [ 'Arbīl [Erbil]', 'arbil' ],
57
+ [ 'Kirkūk [Kirkuk]', 'kirkuk' ],
58
+ [ 'Tehrān [Tehran]', 'tehran' ],
59
+ [ 'Eṣfahān [Isfahan]', 'esfahan' ],
60
+ [ 'Shīrāz [Shiraz]', 'shiraz' ],
61
+ [ 'Tabrīz [Tabriz]', 'tabriz' ],
62
+ [ 'Ahvāz [Ahvaz]', 'ahvaz' ],
63
+ [ 'Ad-Dawḥah [Doha]', 'addawhah'],
64
+ [ 'Ḥalab [Aleppo]', 'halab'],
65
+ [ 'Al-Madīnah [Medina]', 'almadinah'],
66
+ [ 'Ad-Dammām [Dammam]', 'addammam' ],
67
+ [ 'Aṭ-Ṭā’if', 'attaif'],
68
+ [ 'Ḫamīs Mušayṭ', 'hamismusayt'],
69
+ [ "Ṣan'ā' [Sana'a]", 'sana'],
70
+ [ "P'yŏngyang [Pyongyang]", 'pyongyang' ],
71
+ [ 'Kāṭhmāḍaũ [Kathmandu]', 'kathmadau' ]
42
72
  ]
43
73
 
44
74
  txt_io.each do |txt|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.9.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2014-03-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: props
16
- requirement: &76193510 !ruby/object:Gem::Requirement
16
+ requirement: &73970010 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *76193510
24
+ version_requirements: *73970010
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: logutils
27
- requirement: &76193110 !ruby/object:Gem::Requirement
27
+ requirement: &73969260 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.5'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *76193110
35
+ version_requirements: *73969260
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &76192860 !ruby/object:Gem::Requirement
38
+ requirement: &73969070 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *76192860
46
+ version_requirements: *73969070
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rdoc
49
- requirement: &76192430 !ruby/object:Gem::Requirement
49
+ requirement: &73968810 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '3.10'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *76192430
57
+ version_requirements: *73968810
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: hoe
60
- requirement: &76191590 !ruby/object:Gem::Requirement
60
+ requirement: &73953840 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '3.3'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *76191590
68
+ version_requirements: *73953840
69
69
  description: textutils - Text Filters, Helpers, Readers and More
70
70
  email: ruby-talk@ruby-lang.org
71
71
  executables: []
@@ -79,6 +79,8 @@ files:
79
79
  - Rakefile
80
80
  - lib/textutils.rb
81
81
  - lib/textutils/classifier.rb
82
+ - lib/textutils/core_ext/file.rb
83
+ - lib/textutils/core_ext/time.rb
82
84
  - lib/textutils/filter/code_filter.rb
83
85
  - lib/textutils/filter/comment_filter.rb
84
86
  - lib/textutils/filter/erb_django_filter.rb