textutils 0.5.10 → 0.5.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +2 -0
- data/lib/textutils/helper/address_helper.rb +29 -0
- data/lib/textutils/helper/title_helper.rb +42 -14
- data/lib/textutils/helper/value_helper.rb +38 -0
- data/lib/textutils/reader/values_reader.rb +0 -120
- data/lib/textutils/utils.rb +1 -0
- data/lib/textutils/version.rb +1 -1
- data/lib/textutils.rb +2 -0
- metadata +10 -8
data/Manifest.txt
CHANGED
@@ -7,8 +7,10 @@ lib/textutils/filter/code_filter.rb
|
|
7
7
|
lib/textutils/filter/comment_filter.rb
|
8
8
|
lib/textutils/filter/erb_django_filter.rb
|
9
9
|
lib/textutils/filter/erb_filter.rb
|
10
|
+
lib/textutils/helper/address_helper.rb
|
10
11
|
lib/textutils/helper/title_helper.rb
|
11
12
|
lib/textutils/helper/unicode_helper.rb
|
13
|
+
lib/textutils/helper/value_helper.rb
|
12
14
|
lib/textutils/reader/code_reader.rb
|
13
15
|
lib/textutils/reader/fixture_reader.rb
|
14
16
|
lib/textutils/reader/hash_reader.rb
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
  module AddressHelper

    # Puts the part of a two-part address that starts with a zip code first.
    #
    # The two parts are separated by '//'. For now only a german 5-digit
    # zip code (five digits followed by whitespace) is recognized.
    #
    #  e.g.  Alte Plauener Straße 24 // 95028 Hof   becomes
    #        95028 Hof // Alte Plauener Straße 24
    #
    # old_address_line - address string (or nil)
    #
    # Returns the swapped address as a new string, or old_address_line itself
    # (pass through) if it is nil, does not have exactly two parts, or the
    # second part does not start with a 5-digit zip code.
    def normalize_address( old_address_line )
      return old_address_line if old_address_line.nil?   # nothing to normalize

      lines = old_address_line.split( '//' )
      return old_address_line unless lines.size == 2     # only two-part addresses get checked

      line1 = lines[0].strip
      line2 = lines[1].strip

      # NB: use \A (not ^) - the zip code must start the second part itself;
      #   ^ would also match at the start of any later line inside the part
      if line2 =~ /\A[0-9]{5}\s/
        "#{line2} // #{line1}"   # swap - let part w/ 5-digit zip code go first
      else
        old_address_line         # default - do nothing - just pass through
      end
    end

  end # module AddressHelper
end # module TextUtils
|
@@ -4,29 +4,57 @@
|
|
4
4
|
module TextUtils
|
5
5
|
module TitleHelper
|
6
6
|
|
7
|
-
def
|
7
|
+
def strip_translations( title )
  # remove optional english translation(s) in square brackets ([])
  #   e.g. Wien [Vienna]  =>  Wien
  #
  # NB: every bracket group gets removed on its own; text in between
  #   two groups is kept
  #   e.g. Wien [Vienna] Süd [South]  =>  Wien  Süd
  #
  # returns a new string; surrounding whitespace is NOT touched

  title.gsub( /\[[^\]]+\]/, '' )
end
|
13
|
+
|
14
|
+
def strip_subtitles( title )
  # remove optional longer title part(s) in ()
  #   e.g. Las Palmas (de Gran Canaria)  =>  Las Palmas
  #        Palma (de Mallorca)           =>  Palma
  #
  # NB: every () group gets removed on its own; text in between
  #   two groups is kept
  #
  # returns a new string; surrounding whitespace is NOT touched

  title.gsub( /\([^)]+\)/, '' )
end
|
21
|
+
|
22
|
+
def strip_tags( title )   # todo: use an alias or rename for better name ??
  # remove optional title part(s) in {}
  #   e.g. Ottakringer {Bio}          =>  Ottakringer
  #        Ottakringer {Alkoholfrei}  =>  Ottakringer
  #
  # NB: every {} group gets removed on its own; text in between
  #   two groups is kept
  #
  # todo: use for autotags? e.g. {Bio} => bio
  #
  # returns a new string; surrounding whitespace is NOT touched

  title.gsub( /\{[^}]+\}/, '' )
end
|
31
|
+
|
32
|
+
def strip_whitespaces( title )
  # drops blanks, tabs and punctuation from the title:
  #   space, tab, underscore, dash, dot, round and square brackets,
  #   single and double quotes, slash
  # returns a new string
  junk_chars = /[ \t_\-\.()\[\]'"\/]/
  title.gsub( junk_chars, '' )
end
|
36
|
+
|
37
|
+
def strip_special_chars( title )
  # drops the special chars percent, ampersand and degree sign (%&°)
  # returns a new string
  special_chars = /[%&°]/
  title.gsub( special_chars, '' )
end
|
8
41
|
|
9
|
-
|
42
|
+
def title_to_key( title )
|
10
43
|
|
44
|
+
## NB: used in/moved from readers/values_reader.rb
|
11
45
|
|
12
46
|
## NB: downcase does NOT work for accented chars (thus, include in alternatives)
|
13
47
|
key = title.downcase
|
14
48
|
|
15
|
-
|
16
|
-
key = key.gsub( /\[.+\]/, '' )
|
49
|
+
key = strip_translations( key )
|
17
50
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
## remove optional longer title part in {} e.g. Ottakringer {Bio} or {Alkoholfrei}
|
22
|
-
## todo: use for autotags? e.g. {Bio} => bio
|
23
|
-
key = key.gsub( /\{.+\}/, '' )
|
51
|
+
key = strip_subtitles( key )
|
52
|
+
|
53
|
+
key = strip_tags( key )
|
24
54
|
|
25
|
-
|
26
|
-
key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )
|
55
|
+
key = strip_whitespaces( key )
|
27
56
|
|
28
|
-
|
29
|
-
key = key.gsub( /[%&°]/, '' )
|
57
|
+
key = strip_special_chars( key )
|
30
58
|
|
31
59
|
## turn accented char into ascii look alike if possible
|
32
60
|
##
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
  module ValueHelper

    # Predicates for guessing what kind of value a string holds.
    #
    # NB: all checks return the result of =~, that is, the match position
    #   (an integer - truthy, even if 0) or nil if there is no match.
    #
    # NB: whole-value checks are anchored with \A and \z (not ^ and $);
    #   ^ and $ match at every line start/end and would let
    #   multi-line values such as "foo\nTX" slip through.

    def is_region?( value )
      # assume region code e.g. TX or N (one or two upper case letters)
      value =~ /\A[A-Z]{1,2}\z/
    end

    def is_year?( value )
      # founded/established year e.g. 1776 (exactly four digits)
      value =~ /\A[0-9]{4}\z/
    end

    def is_website?( value )
      # check for url/internet address e.g. www.ottakringer.at
      #  - must start w/ www. or
      #  - must end w/ .com
      #
      # fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
      value =~ /\Awww\.|\.com\z/
    end

    def is_address?( value )
      # if value includes // assume address e.g. 3970 Weitra // Sparkasseplatz 160
      value =~ /\/{2}/
    end

    def is_taglist?( value )
      # only lower case letters, digits, pipe (|), underscore and space allowed
      value =~ /\A[a-z0-9\|_ ]+\z/
    end

  end # module ValueHelper
end # module TextUtils
|
@@ -171,124 +171,4 @@ class ValuesReader
|
|
171
171
|
end # method each_line
|
172
172
|
|
173
173
|
|
174
|
-
|
175
|
-
def each_line_old_single_line_records_only
|
176
|
-
|
177
|
-
@data.each_line do |line|
|
178
|
-
|
179
|
-
## allow alternative comment lines
|
180
|
-
## e.g. -- comment or
|
181
|
-
## % comment
|
182
|
-
## why? # might get used by markdown for marking headers, for example
|
183
|
-
|
184
|
-
## NB: for now alternative comment lines not allowed as end of line style e.g
|
185
|
-
## some data, more data -- comment here
|
186
|
-
|
187
|
-
if line =~ /^\s*#/ || line =~ /^\s*--/ || line =~ /^\s*%/
|
188
|
-
# skip komments and do NOT copy to result (keep comments secret!)
|
189
|
-
logger.debug 'skipping comment line'
|
190
|
-
next
|
191
|
-
end
|
192
|
-
|
193
|
-
if line =~ /^\s*$/
|
194
|
-
# kommentar oder leerzeile überspringen
|
195
|
-
logger.debug 'skipping blank line'
|
196
|
-
next
|
197
|
-
end
|
198
|
-
|
199
|
-
|
200
|
-
# pass 1) remove possible trailing eol comment
|
201
|
-
## e.g -> nyc, New York # Sample EOL Comment Here (with or without commas,,,,)
|
202
|
-
## becomes -> nyc, New York
|
203
|
-
|
204
|
-
line = line.sub( /\s+#.+$/, '' )
|
205
|
-
|
206
|
-
# pass 2) remove leading and trailing whitespace
|
207
|
-
|
208
|
-
line = line.strip
|
209
|
-
|
210
|
-
### guard escaped commas (e.g. \,)
|
211
|
-
line = line.gsub( '\,', '@commma@' )
|
212
|
-
|
213
|
-
## use generic separator (allow us to configure separator)
|
214
|
-
line = line.gsub( ',', '@sep@')
|
215
|
-
|
216
|
-
## restore escaped commas (before split)
|
217
|
-
line = line.gsub( '@commma@', ',' )
|
218
|
-
|
219
|
-
|
220
|
-
logger.debug "line: >>#{line}<<"
|
221
|
-
|
222
|
-
values = line.split( '@sep@' )
|
223
|
-
|
224
|
-
# pass 1) remove leading and trailing whitespace for values
|
225
|
-
|
226
|
-
values = values.map { |value| value.strip }
|
227
|
-
|
228
|
-
##### todo remove support of comment column? (NB: must NOT include commas)
|
229
|
-
# pass 2) remove comment columns
|
230
|
-
|
231
|
-
values = values.select do |value|
|
232
|
-
if value =~ /^#/ ## start with # treat it as a comment column; e.g. remove it
|
233
|
-
logger.debug " removing column with value >>#{value}<<"
|
234
|
-
false
|
235
|
-
else
|
236
|
-
true
|
237
|
-
end
|
238
|
-
end
|
239
|
-
|
240
|
-
logger.debug " values: >>#{values.join('<< >>')}<<"
|
241
|
-
|
242
|
-
|
243
|
-
### todo/fix: allow check - do NOT allow mixed use of with key and w/o key
|
244
|
-
## either use keys or do NOT use keys; do NOT mix in a single fixture file
|
245
|
-
|
246
|
-
|
247
|
-
### support autogenerate key from first title value
|
248
|
-
|
249
|
-
# if it looks like a key (only a-z lower case allowed); assume it's a key
|
250
|
-
# - also allow . in keys e.g. world.quali.america, at.cup, etc.
|
251
|
-
# - also allow 0-9 in keys e.g. at.2, at.3.1, etc.
|
252
|
-
|
253
|
-
# fix/todo: add support for leading underscore _
|
254
|
-
# or allow keys starting w/ digits?
|
255
|
-
if values[0] =~ /^([a-z][a-z0-9.]*[a-z0-9]|[a-z])$/ # NB: key must start w/ a-z letter (NB: minimum one letter possible)
|
256
|
-
key_col = values[0]
|
257
|
-
title_col = values[1]
|
258
|
-
more_cols = values[2..-1]
|
259
|
-
else
|
260
|
-
key_col = '<auto>'
|
261
|
-
title_col = values[0]
|
262
|
-
more_cols = values[1..-1]
|
263
|
-
end
|
264
|
-
|
265
|
-
attribs = {}
|
266
|
-
|
267
|
-
## title (split of optional synonyms)
|
268
|
-
# e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
|
269
|
-
titles = title_col.split('|')
|
270
|
-
|
271
|
-
attribs[ :title ] = titles[0]
|
272
|
-
|
273
|
-
## add optional synonyms if present
|
274
|
-
attribs[ :synonyms ] = titles[1..-1].join('|') if titles.size > 1
|
275
|
-
|
276
|
-
if key_col == '<auto>'
|
277
|
-
## autogenerate key from first title
|
278
|
-
key_col = TextUtils.title_to_key( titles[0] )
|
279
|
-
logger.debug " autogen key >#{key_col}< from title >#{titles[0]}<, textutils version #{TextUtils::VERSION}"
|
280
|
-
end
|
281
|
-
|
282
|
-
attribs[ :key ] = key_col
|
283
|
-
|
284
|
-
attribs = attribs.merge( @more_values ) # e.g. merge country_id and other defaults if present
|
285
|
-
|
286
|
-
yield( attribs, more_cols )
|
287
|
-
|
288
|
-
end # each lines
|
289
|
-
|
290
|
-
end # method each_line
|
291
|
-
|
292
|
-
|
293
|
-
|
294
174
|
end # class ValuesReader
|
data/lib/textutils/utils.rb
CHANGED
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
@@ -23,6 +23,8 @@ require 'textutils/filter/erb_filter'
|
|
23
23
|
|
24
24
|
require 'textutils/helper/unicode_helper'
|
25
25
|
require 'textutils/helper/title_helper'
|
26
|
+
require 'textutils/helper/address_helper'
|
27
|
+
require 'textutils/helper/value_helper'
|
26
28
|
|
27
29
|
require 'textutils/utils'
|
28
30
|
require 'textutils/reader/code_reader'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.11
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &75139420 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *75139420
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &75139200 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '3.10'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *75139200
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &75138980 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *75138980
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: webslideshow@googlegroups.com
|
49
49
|
executables: []
|
@@ -60,8 +60,10 @@ files:
|
|
60
60
|
- lib/textutils/filter/comment_filter.rb
|
61
61
|
- lib/textutils/filter/erb_django_filter.rb
|
62
62
|
- lib/textutils/filter/erb_filter.rb
|
63
|
+
- lib/textutils/helper/address_helper.rb
|
63
64
|
- lib/textutils/helper/title_helper.rb
|
64
65
|
- lib/textutils/helper/unicode_helper.rb
|
66
|
+
- lib/textutils/helper/value_helper.rb
|
65
67
|
- lib/textutils/reader/code_reader.rb
|
66
68
|
- lib/textutils/reader/fixture_reader.rb
|
67
69
|
- lib/textutils/reader/hash_reader.rb
|