textutils 0.9.0 → 0.9.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/Manifest.txt CHANGED
@@ -4,6 +4,8 @@ README.md
4
4
  Rakefile
5
5
  lib/textutils.rb
6
6
  lib/textutils/classifier.rb
7
+ lib/textutils/core_ext/file.rb
8
+ lib/textutils/core_ext/time.rb
7
9
  lib/textutils/filter/code_filter.rb
8
10
  lib/textutils/filter/comment_filter.rb
9
11
  lib/textutils/filter/erb_django_filter.rb
data/lib/textutils.rb CHANGED
@@ -42,6 +42,9 @@ require 'textutils/helper/address_helper'
42
42
  require 'textutils/helper/value_helper'
43
43
 
44
44
  require 'textutils/utils'
45
+ require 'textutils/core_ext/file'
46
+ require 'textutils/core_ext/time'
47
+
45
48
  require 'textutils/reader/code_reader'
46
49
  require 'textutils/reader/hash_reader'
47
50
  require 'textutils/reader/hash_reader_v2'
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ class File
4
+ def self.read_utf8( path )
5
+ text = open( path, 'r:bom|utf-8' ) do |file|
6
+ file.read
7
+ end
8
+
9
+ # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
10
+ text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
11
+
12
+ text
13
+ end
14
+ end # class File
15
+
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ class Time
5
+
6
+ def self.cet( str ) # central european time (cet) + central european summer time (cest)
7
+ ActiveSupport::TimeZone['Vienna'].parse( str )
8
+ end
9
+
10
+ def self.eet( str ) # eastern european time (eet) + 2 hours
11
+ ActiveSupport::TimeZone['Bucharest'].parse( str )
12
+ end
13
+
14
+ def self.cst( str ) # central standard time (cst) - 6 hours
15
+ ActiveSupport::TimeZone['Mexico City'].parse( str )
16
+ end
17
+
18
+ end # class Time
19
+
@@ -45,7 +45,7 @@ module TextUtils
45
45
 
46
46
  def strip_whitespaces( title )
47
47
  # remove all whitespace and punctuation
48
- title.gsub( /[ \t_\-\.!()\[\]'"\/]/, '' )
48
+ title.gsub( /[ \t_\-\.!()\[\]'"’\/]/, '' )
49
49
  end
50
50
 
51
51
  def strip_special_chars( title )
@@ -81,21 +81,28 @@ module TextUtils
81
81
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
82
82
 
83
83
  ## todo: add unicode codepoint name
84
-
84
+
85
+
85
86
  alternatives = [
86
87
  ['ß', 'ss'],
88
+
87
89
  ['æ', 'ae'],
88
90
  ['ä', 'ae'],
89
- ['ā', 'a' ], # e.g. Liepājas
91
+ ['ā', 'a' ], # e.g. Liepājas, Kāṭhmāḍaũ
90
92
  ['á', 'a' ], # e.g. Bogotá, Králové
93
+ ['à', 'a' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
91
94
  ['ã', 'a' ], # e.g São Paulo
92
95
  ['ă', 'a' ], # e.g. Chișinău
93
96
  ['â', 'a' ], # e.g Goiânia
94
97
  ['å', 'a' ], # e.g. Vålerenga
95
98
  ['ą', 'a' ], # e.g. Śląsk
99
+
96
100
  ['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
97
101
  ['ć', 'c' ], # e.g. Budućnost
98
102
  ['č', 'c' ], # e.g. Tradiční, Výčepní
103
+
104
+ ['ḍ', 'd' ], # e.g. Kāṭhmāḍaũ [Kathmandu]
105
+
99
106
  ['é', 'e' ], # e.g. Vélez, Králové
100
107
  ['è', 'e' ], # e.g. Rivières
101
108
  ['ê', 'e' ], # e.g. Grêmio
@@ -103,31 +110,53 @@ module TextUtils
103
110
  ['ĕ', 'e' ], # e.g. Svĕtlý
104
111
  ['ė', 'e' ], # e.g. Vėtra
105
112
  ['ë', 'e' ], # e.g. Skënderbeu
113
+
106
114
  ['ğ', 'g' ], # e.g. Qarabağ
115
+
116
+ ['ḥ', 'h' ], # e.g. Ad-Dawḥah [Doha]
117
+
107
118
  ['ì', 'i' ], # e.g. Potosì
108
119
  ['í', 'i' ], # e.g. Ústí
109
120
  ['ï', 'i' ], # e.g. El Djazaïr
121
+ ['ī', 'i' ], # e.g. Al-Iskandarīyah [Alexandria]
122
+
110
123
  ['ł', 'l' ], # e.g. Wisła, Wrocław
111
124
  ['ñ', 'n' ], # e.g. Porteño
112
125
  ['ň', 'n' ], # e.g. Plzeň, Třeboň
126
+
113
127
  ['ö', 'oe'],
114
128
  ['ő', 'o' ], # e.g. Győri
115
129
  ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
116
130
  ['õ', 'o' ], # e.g. Nõmme
117
131
  ['ô', 'o' ], # e.g. Amazônia (pt)
118
- ['ō', 'o' ], # e.g. Tōkyō
132
+ ['ō', 'o' ], # e.g. Tōkyō, Pishōr
133
+ ['ŏ', 'o' ], # e.g. P'yŏngyang [Pyongyang]
119
134
  ['ø', 'o' ], # e.g. Fuglafjørdur, København
135
+ ['ố', 'o' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
136
+ ['ồ', 'o' ], # e.g. Thành Phố Hồ Chí Minh [Saigon]
137
+ ['ộ', 'o' ], # e.g. Hà Nội [Hanoi]
138
+
120
139
  ['ř', 'r' ], # e.g. Třeboň
140
+
121
141
  ['ș', 's' ], # e.g. Chișinău, București
122
142
  ['ş', 's' ], # e.g. Beşiktaş
123
143
  ['š', 's' ], # e.g. Košice
144
+ ['ṣ', 's' ], # e.g. Al-Mawṣil [Mosul]
145
+
124
146
  ['ť', 't' ], # e.g. Měšťan
147
+ ['ṭ', 't' ], # e.g. Al-Kharṭūm [Khartoum], Kāṭhmāḍaũ
148
+
125
149
  ['ü', 'ue'],
126
150
  ['ú', 'u' ], # e.g. Fútbol
127
151
  ['ù', 'u' ], # e.g. Xyauyù (it)
128
152
  ['ū', 'u' ], # e.g. Sūduva
129
153
  ['ů', 'u' ], # e.g. Sládkův
154
+ ['ũ', 'u' ], # e.g. Kāṭhmāḍaũ [Kathmandu]
155
+
130
156
  ['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
157
+
158
+ ['x̌', 'x'], # e.g. Pex̌awar [Peshawar]
159
+
131
160
  ['ý', 'y' ], # e.g. Nefitrovaný
132
161
  ['ź', 'z' ], # e.g. Łódź
133
162
  ['ž', 'z' ], # e.g. Domžale, Petržalka
@@ -135,15 +164,25 @@ module TextUtils
135
164
 
136
165
  ['Á', 'a' ], # e.g. Águila (es)
137
166
  ['Č', 'c' ], # e.g. České
167
+
168
+ ['Ḥ', 'h' ], # e.g. Ḥalab [Aleppo]
169
+ ['Ḫ', 'h' ], # e.g. Ḫamīs Mušayṭ
138
170
  ['İ', 'i' ], # e.g. İnter
139
171
  ['Í', 'i' ], # e.g. ÍBV
140
172
  ['Ł', 'l' ], # e.g. Łódź
173
+
141
174
  ['Ö', 'oe' ], # e.g. Örebro
175
+ ['Ō', 'o' ], # e.g. Ōsaka [Osaka]
142
176
  ['Ø', 'o' ], # e.g. Nogne Ø Imperial Stout (no)
177
+
143
178
  ['Ř', 'r' ], # e.g. Řezák
179
+
144
180
  ['Ś', 's' ], # e.g. Śląsk
145
181
  ['Š', 's' ], # e.g. MŠK
146
182
  ['Ş', 's' ], # e.g. Şüvälan
183
+ ['Ṣ', 's' ], # e.g. Ṣan'ā' [Sana'a]
184
+
185
+ ['Ṭ', 't' ], # e.g. Ṭarābulus [Tripoli]
147
186
  ['Ú', 'u' ], # e.g. Ústí, Újpest
148
187
  ['Ž', 'z' ], # e.g. Žilina
149
188
  ['Ż', 'z' ] # e.g. Żywiec (polish)
@@ -9,21 +9,6 @@ end
9
9
 
10
10
 
11
11
 
12
- class File
13
- def self.read_utf8( path )
14
- text = open( path, 'r:bom|utf-8' ) do |file|
15
- file.read
16
- end
17
-
18
- # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
19
- text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
20
-
21
- text
22
- end
23
- end # class File
24
-
25
-
26
-
27
12
  def title_esc_regex( title_unescaped )
28
13
  puts "*** warn: depreceated fn call: use TextUtils.title_esc_regex() or include TextUtils::TitleHelpers"
29
14
  TextUtils.title_esc_regex( title_unescaped )
@@ -1,6 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.9.0'
4
+ VERSION = '0.9.1'
5
5
 
6
6
  end # module TextUtils
@@ -38,7 +38,37 @@ class TestTitleHelper < MiniTest::Unit::TestCase
38
38
  [ '$Alianz$ Arena', 'alianzarena' ],
39
39
  [ 'Arena Amazônia', 'arenaamazonia' ],
40
40
  [ 'Tōkyō [Tokyo]', 'tokyo' ],
41
- [ 'El Djazaïr [Algiers]', 'eldjazair' ]
41
+ [ 'Ōsaka [Osaka]', 'osaka' ],
42
+ [ 'El Djazaïr [Algiers]', 'eldjazair' ],
43
+ [ 'Al-Kharṭūm [Khartoum]', 'alkhartum' ],
44
+ [ 'Ṭarābulus [Tripoli]', 'tarabulus' ],
45
+ [ 'Al-Iskandarīyah [Alexandria]', 'aliskandariyah' ],
46
+ [ 'Pex̌awar', 'pexawar'],
47
+ [ 'Pishōr', 'pishor' ],
48
+ [ 'Pishāwar', 'pishawar' ],
49
+ [ 'Islām ābād', 'islamabad' ],
50
+ [ 'Thành Phố Hồ Chí Minh [Saigon]', 'thanhphohochiminh' ],
51
+ [ 'Hà Nội [Hanoi]', 'hanoi' ],
52
+ [ 'Donets’k', 'donetsk' ],
53
+ [ 'Baghdād [Baghdad]', 'baghdad'],
54
+ [ 'Al-Mawṣil [Mosul]', 'almawsil'],
55
+ [ 'Al-Baṣrah [Basra]', 'albasrah'],
56
+ [ 'Arbīl [Erbil]', 'arbil' ],
57
+ [ 'Kirkūk [Kirkuk]', 'kirkuk' ],
58
+ [ 'Tehrān [Tehran]', 'tehran' ],
59
+ [ 'Eṣfahān [Isfahan]', 'esfahan' ],
60
+ [ 'Shīrāz [Shiraz]', 'shiraz' ],
61
+ [ 'Tabrīz [Tabriz]', 'tabriz' ],
62
+ [ 'Ahvāz [Ahvaz]', 'ahvaz' ],
63
+ [ 'Ad-Dawḥah [Doha]', 'addawhah'],
64
+ [ 'Ḥalab [Aleppo]', 'halab'],
65
+ [ 'Al-Madīnah [Medina]', 'almadinah'],
66
+ [ 'Ad-Dammām [Dammam]', 'addammam' ],
67
+ [ 'Aṭ-Ṭā’if', 'attaif'],
68
+ [ 'Ḫamīs Mušayṭ', 'hamismusayt'],
69
+ [ "Ṣan'ā' [Sana'a]", 'sana'],
70
+ [ "P'yŏngyang [Pyongyang]", 'pyongyang' ],
71
+ [ 'Kāṭhmāḍaũ [Kathmandu]', 'kathmadau' ]
42
72
  ]
43
73
 
44
74
  txt_io.each do |txt|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.9.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2014-03-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: props
16
- requirement: &76193510 !ruby/object:Gem::Requirement
16
+ requirement: &73970010 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *76193510
24
+ version_requirements: *73970010
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: logutils
27
- requirement: &76193110 !ruby/object:Gem::Requirement
27
+ requirement: &73969260 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.5'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *76193110
35
+ version_requirements: *73969260
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &76192860 !ruby/object:Gem::Requirement
38
+ requirement: &73969070 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *76192860
46
+ version_requirements: *73969070
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rdoc
49
- requirement: &76192430 !ruby/object:Gem::Requirement
49
+ requirement: &73968810 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '3.10'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *76192430
57
+ version_requirements: *73968810
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: hoe
60
- requirement: &76191590 !ruby/object:Gem::Requirement
60
+ requirement: &73953840 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '3.3'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *76191590
68
+ version_requirements: *73953840
69
69
  description: textutils - Text Filters, Helpers, Readers and More
70
70
  email: ruby-talk@ruby-lang.org
71
71
  executables: []
@@ -79,6 +79,8 @@ files:
79
79
  - Rakefile
80
80
  - lib/textutils.rb
81
81
  - lib/textutils/classifier.rb
82
+ - lib/textutils/core_ext/file.rb
83
+ - lib/textutils/core_ext/time.rb
82
84
  - lib/textutils/filter/code_filter.rb
83
85
  - lib/textutils/filter/comment_filter.rb
84
86
  - lib/textutils/filter/erb_django_filter.rb