textutils 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -1
- data/lib/textutils.rb +3 -1
- data/lib/textutils/helper/value_helper_i.rb +139 -0
- data/lib/textutils/helper/value_helper_ii.rb +83 -0
- data/lib/textutils/helper/value_helper_iii_numbers.rb +78 -0
- data/lib/textutils/parser/name_parser.rb +50 -1
- data/lib/textutils/version.rb +1 -1
- data/test/test_title_finder.rb +33 -4
- data/test/test_title_helper.rb +2 -1
- metadata +5 -3
- data/lib/textutils/helper/value_helper.rb +0 -249
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 16e24e7bc0a1004bc3fca49b6f3bfcaa6ce2e5ce
|
4
|
+
data.tar.gz: 6aa074466c9c89b089ecf0ac6d75164050abd0cb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f386324301ffc37deba32eb32202edf9b8706ff1b14971ba5a41db22a2d9a314be64914b4296e359cea2620a6b54ad89ae1cfa6e7c9fa1322ec4ef29020fb688
|
7
|
+
data.tar.gz: d56300976f712b8bb9de8f1e2e571948f35783090d1d7f1a238736f8efd6a60aba915d5ff807cb942a38f47f80b8d6917e6e1e846371b72b62022c8b3ad029f0
|
data/Manifest.txt
CHANGED
@@ -18,7 +18,9 @@ lib/textutils/helper/hypertext_helper.rb
|
|
18
18
|
lib/textutils/helper/tag_helper.rb
|
19
19
|
lib/textutils/helper/title_helper.rb
|
20
20
|
lib/textutils/helper/unicode_helper.rb
|
21
|
-
lib/textutils/helper/value_helper.rb
|
21
|
+
lib/textutils/helper/value_helper_i.rb
|
22
|
+
lib/textutils/helper/value_helper_ii.rb
|
23
|
+
lib/textutils/helper/value_helper_iii_numbers.rb
|
22
24
|
lib/textutils/helper/xml_helper.rb
|
23
25
|
lib/textutils/page.rb
|
24
26
|
lib/textutils/parser/name_parser.rb
|
data/lib/textutils.rb
CHANGED
@@ -43,7 +43,9 @@ require 'textutils/helper/unicode_helper'
|
|
43
43
|
require 'textutils/helper/tag_helper'
|
44
44
|
require 'textutils/helper/title_helper'
|
45
45
|
require 'textutils/helper/address_helper'
|
46
|
-
require 'textutils/helper/value_helper'
|
46
|
+
require 'textutils/helper/value_helper_i'
|
47
|
+
require 'textutils/helper/value_helper_ii'
|
48
|
+
require 'textutils/helper/value_helper_iii_numbers'
|
47
49
|
|
48
50
|
require 'textutils/utils'
|
49
51
|
require 'textutils/core_ext/file'
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
|
5
|
+
module ValueHelper
|
6
|
+
|
7
|
+
# if it looks like a key (only a-z lower case allowed); assume it's a key
|
8
|
+
# - also allow . in keys e.g. world.quali.america, at.cup, etc.
|
9
|
+
# - also allow 0-9 in keys e.g. at.2, at.3.1, etc.
|
10
|
+
# - also allow leading digits e.g. 1850muenchen, 3kronen, etc.
|
11
|
+
|
12
|
+
TITLE_KEY_REGEX = /^(
|
13
|
+
[a-z][a-z0-9.]*[a-z0-9]
|
14
|
+
|
|
15
|
+
[a-z] # allow single letter keys e.g. n,s,etc.
|
16
|
+
|
|
17
|
+
[1-9][0-9]*[a-z]+ # NOTE: also allow starts with leading digits e.g. 1850muenchen, 3kronen etc.;
|
18
|
+
# *MUST* be followed by letter;
|
19
|
+
# note: leading zero for now *NOT* allowed
|
20
|
+
)$
|
21
|
+
/x
|
22
|
+
|
23
|
+
|
24
|
+
def find_key_n_title( values ) # note: returns ary [attribs,more_values] / two values
|
25
|
+
# todo/fix:
|
26
|
+
## change title to name
|
27
|
+
## change synonyms to alt_names (!!!)
|
28
|
+
## => use new method e.g. find_key_n_name(s) - why?? why not??
|
29
|
+
|
30
|
+
|
31
|
+
## fix: add/configure logger for ActiveRecord!!!
|
32
|
+
logger = LogKernel::Logger.root
|
33
|
+
|
34
|
+
|
35
|
+
### support autogenerate key from first title value
|
36
|
+
if values[0] =~ TITLE_KEY_REGEX
|
37
|
+
key_col = values[0]
|
38
|
+
title_col = values[1]
|
39
|
+
more_values = values[2..-1]
|
40
|
+
else
|
41
|
+
key_col = '<auto>'
|
42
|
+
title_col = values[0]
|
43
|
+
more_values = values[1..-1]
|
44
|
+
end
|
45
|
+
|
46
|
+
attribs = {}
|
47
|
+
|
48
|
+
## check title_col for grade (e.g. ***/**/*) and use returned stripped title_col if exits
|
49
|
+
grade, title_col = find_grade( title_col )
|
50
|
+
|
51
|
+
# NB: for now - do NOT include default grade e.g. if grade (***/**/*) not present; attrib will not be present too
|
52
|
+
if grade == 1 || grade == 2 || grade == 3 # grade found/present
|
53
|
+
logger.debug " found grade #{grade} in title"
|
54
|
+
attribs[:grade] = grade
|
55
|
+
end
|
56
|
+
|
57
|
+
|
58
|
+
## fix/todo: add find parts ??
|
59
|
+
# e.g. ‹Estrella› ‹Damm› Inedit
|
60
|
+
# becomes => title: 'Estrella Damm Inedit' and parts: ['Estrella','Damm']
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
## title (split of optional tree hierarchy)
|
65
|
+
## e.g. Leverkusen › Köln/Bonn › Nordrhein-Westfalen
|
66
|
+
## Gelsenkirchen › Ruhrgebiet › Nordrhein-Westfalen
|
67
|
+
## München [Munich] › Bayern etc.
|
68
|
+
|
69
|
+
## fix!!!! - trailing hierarchy get *ignored* for now!!! - fix!!
|
70
|
+
## pass along in :tree (or :hierarchy) ??
|
71
|
+
|
72
|
+
|
73
|
+
## note: must include leading and trailing space for now (fix!! later)
|
74
|
+
## hack for avoiding conflict w/ parts; fix: read/parse parts first
|
75
|
+
## todo: also allow > (as an alternative to ›)
|
76
|
+
|
77
|
+
title_tree = title_col.split( /[ ]+[›][ ]+/ )
|
78
|
+
|
79
|
+
## title (split of optional synonyms)
|
80
|
+
# e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
|
81
|
+
# München [Munich]
|
82
|
+
titles = NameTokenizer.new.tokenize( title_tree[0] )
|
83
|
+
|
84
|
+
attribs[ :title ] = titles[0]
|
85
|
+
|
86
|
+
## add optional synonyms if present
|
87
|
+
attribs[ :synonyms ] = titles[1..-1].join('|') if titles.size > 1
|
88
|
+
|
89
|
+
if key_col == '<auto>'
|
90
|
+
## autogenerate key from first title
|
91
|
+
key_col = TextUtils.title_to_key( titles[0] )
|
92
|
+
logger.debug " autogen key »#{key_col}« from title »#{titles[0]}«"
|
93
|
+
end
|
94
|
+
|
95
|
+
attribs[ :key ] = key_col
|
96
|
+
|
97
|
+
[attribs, more_values]
|
98
|
+
end
|
99
|
+
|
100
|
+
|
101
|
+
def find_grade( value ) # NB: returns ary [grade,value] / two values
|
102
|
+
grade = 4 # defaults to grade 4 e.g *** => 1, ** => 2, * => 3, -/- => 4
|
103
|
+
|
104
|
+
# NB: stars must end field/value or start field/value
|
105
|
+
# e.g.
|
106
|
+
# *** Anton Bauer or
|
107
|
+
# Anton Bauer ***
|
108
|
+
|
109
|
+
value = value.sub( /^\s*(\*{1,3})\s+/ ) do |_|
|
110
|
+
if $1 == '***'
|
111
|
+
grade = 1
|
112
|
+
elsif $1 == '**'
|
113
|
+
grade = 2
|
114
|
+
elsif $1 == '*'
|
115
|
+
grade = 3
|
116
|
+
else
|
117
|
+
# unknown grade; not possible, is'it?
|
118
|
+
end
|
119
|
+
'' # remove * from title if found
|
120
|
+
end
|
121
|
+
|
122
|
+
value = value.sub( /\s+(\*{1,3})\s*$/ ) do |_|
|
123
|
+
if $1 == '***'
|
124
|
+
grade = 1
|
125
|
+
elsif $1 == '**'
|
126
|
+
grade = 2
|
127
|
+
elsif $1 == '*'
|
128
|
+
grade = 3
|
129
|
+
else
|
130
|
+
# unknown grade; not possible, is'it?
|
131
|
+
end
|
132
|
+
'' # remove * from title if found
|
133
|
+
end
|
134
|
+
|
135
|
+
[grade,value]
|
136
|
+
end
|
137
|
+
|
138
|
+
end # module ValueHelper
|
139
|
+
end # module TextUtils
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
|
5
|
+
module ValueHelper
|
6
|
+
|
7
|
+
#####
|
8
|
+
## fix!!!!: move to beerdb ??? why? why not?? - yes, move to beerdb-models
|
9
|
+
|
10
|
+
def match_brewery( value )
|
11
|
+
if value =~ /^by:/ ## by: -brewed by/brewery
|
12
|
+
brewery_key = value[3..-1] ## cut off by: prefix
|
13
|
+
brewery = BeerDb::Model::Brewery.find_by_key!( brewery_key )
|
14
|
+
yield( brewery )
|
15
|
+
true # bingo - match found
|
16
|
+
else
|
17
|
+
false # no match found
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def is_year?( value )
|
23
|
+
# founded/established year e.g. 1776
|
24
|
+
match_result = value =~ /^[0-9]{4}$/
|
25
|
+
# match found if 0,1,2,3 etc or no match if nil
|
26
|
+
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
27
|
+
match_result != nil
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
def match_year( value )
|
32
|
+
if is_year?( value ) # founded/established year e.g. 1776
|
33
|
+
yield( value.to_i )
|
34
|
+
true # bingo - match found
|
35
|
+
else
|
36
|
+
false # no match found
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
def is_address?( value )
|
42
|
+
# if value includes // assume address e.g. 3970 Weitra // Sparkasseplatz 160
|
43
|
+
match_result = value =~ /\/{2}/
|
44
|
+
# match found if 0,1,2,3 etc or no match if nil
|
45
|
+
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
46
|
+
match_result != nil
|
47
|
+
end
|
48
|
+
|
49
|
+
def is_taglist?( value )
|
50
|
+
### note: cannot start w/ number must be letter for now
|
51
|
+
## -- in the future allow free standing years (e.g. 1980 etc.?? why? why not?)
|
52
|
+
## e.g. not allowed 14 ha or 5_000 hl etc.
|
53
|
+
match_result = value =~ /^([a-z][a-z0-9\|_ ]*[a-z0-9]|[a-z])$/
|
54
|
+
# match found if 0,1,2,3 etc or no match if nil
|
55
|
+
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
56
|
+
match_result != nil
|
57
|
+
end
|
58
|
+
|
59
|
+
|
60
|
+
def is_website?( value )
|
61
|
+
# check for url/internet address e.g. www.ottakringer.at
|
62
|
+
# - must start w/ www. or
|
63
|
+
# - must end w/ .com
|
64
|
+
#
|
65
|
+
# fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
|
66
|
+
match_result = value =~ /^www\.|\.com$/
|
67
|
+
# match found if 0,1,2,3 etc or no match if nil
|
68
|
+
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
69
|
+
match_result != nil
|
70
|
+
end
|
71
|
+
|
72
|
+
def match_website( value )
|
73
|
+
if is_website?( value ) # check for url/internet address e.g. www.ottakringer.at
|
74
|
+
# fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
|
75
|
+
yield( value )
|
76
|
+
true # bingo - match found
|
77
|
+
else
|
78
|
+
false # no match found
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
end # module ValueHelper
|
83
|
+
end # module TextUtils
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
# match numbers (units)
|
5
|
+
# e.g km_squared, abv, etc.
|
6
|
+
|
7
|
+
module TextUtils
|
8
|
+
module ValueHelper
|
9
|
+
|
10
|
+
|
11
|
+
def match_number( value )
|
12
|
+
## numeric
|
13
|
+
## note: can use any _ or spaces inside digits e.g. 1_000_000 or 1 000 000)
|
14
|
+
if value =~ /^([0-9][0-9 _]+[0-9])|([0-9]{1,2})$/
|
15
|
+
num = value.gsub(/[ _]/, '').to_i
|
16
|
+
yield( num )
|
17
|
+
true # bingo - match found
|
18
|
+
else
|
19
|
+
false # no match found
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
###########################
|
25
|
+
## numbers w/ units
|
26
|
+
|
27
|
+
def match_km_squared( value )
|
28
|
+
## allow numbers like 453 km² or 45_000 km2
|
29
|
+
if value =~ /^([0-9][0-9 _]+[0-9]|[0-9]{1,2})(?:\s*(?:km2|km²)\s*)$/
|
30
|
+
num = value.gsub( 'km2', '').gsub( 'km²', '' ).gsub(/[ _]/, '').to_i
|
31
|
+
yield( num )
|
32
|
+
true # bingo - match found
|
33
|
+
else
|
34
|
+
false # no match found
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def match_abv( value ) # alcohol by volume (abv) e.g. 5.2%
|
39
|
+
if value =~ /^<?\s*(\d+(?:\.\d+)?)\s*%$/
|
40
|
+
# nb: allow leading < e.g. <0.5%
|
41
|
+
yield( $1.to_f ) # convert to decimal? how? use float?
|
42
|
+
true # bingo - match found
|
43
|
+
else
|
44
|
+
false # no match found
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def match_og( value ) # plato (stammwuerze/gravity?) e.g. 11.2°
|
49
|
+
if value =~ /^(\d+(?:\.\d+)?)°$/
|
50
|
+
# nb: no whitespace allowed between ° and number e.g. 11.2°
|
51
|
+
yield( $1.to_f ) # convert to decimal? how? use float?
|
52
|
+
true # bingo - match found
|
53
|
+
else
|
54
|
+
false # no match found
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def match_kcal( value )
|
59
|
+
if value =~ /^(\d+(?:\.\d+)?)\s*kcal(?:\/100ml)?$/ # kcal
|
60
|
+
# nb: allow 44.4 kcal/100ml or 44.4 kcal or 44.4kcal
|
61
|
+
yield( $1.to_f ) # convert to decimal? how? use float?
|
62
|
+
true # bingo - match found
|
63
|
+
else
|
64
|
+
false # no match found
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def match_hl( value ) # hector liters (hl) 1hl = 100l
|
69
|
+
if value =~ /^(?:([0-9][0-9_ ]+[0-9]|[0-9]{1,2})\s*hl)$/ # e.g. 20_000 hl or 50hl etc.
|
70
|
+
yield( $1.gsub( /[ _]/, '' ).to_i )
|
71
|
+
true # bingo - match found
|
72
|
+
else
|
73
|
+
false # no match found
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
end # module ValueHelper
|
78
|
+
end # module TextUtils
|
@@ -2,6 +2,55 @@
|
|
2
2
|
|
3
3
|
# fix: move into TextUtils namespace/module!! ??
|
4
4
|
|
5
|
+
|
6
|
+
class NameTokenizer ## - rename to NameScanner, NameSplitter, NameSeparator, etc.
|
7
|
+
|
8
|
+
## split (single) string value into array of names
|
9
|
+
## e.g.
|
10
|
+
## 'München [Munich]' => ['München', '[Munich]']
|
11
|
+
## 'Wr. Neustadt | Wiener Neustadt' => ['Wr. Neustadt', 'Wiener Neustadt']
|
12
|
+
include LogUtils::Logging
|
13
|
+
|
14
|
+
def tokenize( value ) ## rename to/use split - why? why not??
|
15
|
+
names = []
|
16
|
+
|
17
|
+
# 1) split by | (pipe) -- remove leading n trailing whitespaces
|
18
|
+
parts = value.split( /[ \t]*\|[ \t]*/ )
|
19
|
+
|
20
|
+
# 2) split "inline" translations e.g. München [Munich]
|
21
|
+
|
22
|
+
## todo: add support for Munich [en] e.g. trailing lang tag
|
23
|
+
## todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??
|
24
|
+
|
25
|
+
parts.each do |part|
|
26
|
+
s = StringScanner.new( part )
|
27
|
+
s.skip( /[ \t]+/) # skip whitespaces
|
28
|
+
|
29
|
+
while s.eos? == false
|
30
|
+
if s.check( /\[/ )
|
31
|
+
## scan everything until the end of bracket (e.g.])
|
32
|
+
name = s.scan( /\[[^\]]+\]/)
|
33
|
+
## todo/fix: if name nil - issue warning??
|
34
|
+
# starting w/ [ but no closing ] found !!!! - possible? fix!!
|
35
|
+
else
|
36
|
+
## scan everything until the begin of bracket (e.g.[)
|
37
|
+
name = s.scan( /[^\[]+/)
|
38
|
+
name = name.rstrip ## remove trailing spaces (if present)
|
39
|
+
end
|
40
|
+
names << name
|
41
|
+
|
42
|
+
s.skip( /[ \t]+/) # skip whitespaces
|
43
|
+
logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
|
44
|
+
end
|
45
|
+
end # each part
|
46
|
+
|
47
|
+
logger.debug( "[NameTokenizer] names=#{names.inspect}")
|
48
|
+
names
|
49
|
+
end # method split
|
50
|
+
end # class NameTokenizer
|
51
|
+
|
52
|
+
|
53
|
+
=begin
|
5
54
|
class NameParser
|
6
55
|
|
7
56
|
include LogUtils::Logging
|
@@ -9,4 +58,4 @@ class NameParser
|
|
9
58
|
## to be done
|
10
59
|
|
11
60
|
end # class NameParser
|
12
|
-
|
61
|
+
=end
|
data/lib/textutils/version.rb
CHANGED
data/test/test_title_finder.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_title_finder.rb
|
6
|
+
|
3
7
|
|
4
8
|
require 'helper'
|
5
9
|
|
@@ -8,18 +12,43 @@ class TestTitleFinder < Minitest::Test
|
|
8
12
|
|
9
13
|
include TextUtils::ValueHelper # lets us use find_grade, etc.
|
10
14
|
|
11
|
-
def test_grade
|
12
15
|
|
16
|
+
def test_find_key_n_title
|
17
|
+
attribs, _ = find_key_n_title( ['München [Munich]'] ) ## skip returned more_values (use _)
|
18
|
+
assert_equal 'muenchen', attribs[:key]
|
19
|
+
assert_equal 'München', attribs[:title]
|
20
|
+
assert_equal '[Munich]', attribs[:synonyms]
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_find_key_n_title_w_tree
|
24
|
+
attribs, _ = find_key_n_title( ['München [Munich] › Oberbayern › Bayern'] ) ## skip returned more_values (use _)
|
25
|
+
assert_equal 'muenchen', attribs[:key]
|
26
|
+
assert_equal 'München', attribs[:title]
|
27
|
+
assert_equal '[Munich]', attribs[:synonyms]
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
def test_title_tokenizer
|
32
|
+
names = NameTokenizer.new.tokenize( 'München [Munich]' )
|
33
|
+
assert_equal 2, names.size
|
34
|
+
assert_equal 'München', names[0]
|
35
|
+
assert_equal '[Munich]', names[1]
|
36
|
+
|
37
|
+
names = NameTokenizer.new.tokenize( 'FC Bayern Muenchen|Bayern Muenchen|Bayern' )
|
38
|
+
assert_equal 3, names.size
|
39
|
+
assert_equal 'FC Bayern Muenchen', names[0]
|
40
|
+
assert_equal 'Bayern Muenchen', names[1]
|
41
|
+
assert_equal 'Bayern', names[2]
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_grade
|
13
45
|
assert_equal [1,'Anton Bauer'], find_grade( '*** Anton Bauer' )
|
14
46
|
assert_equal [2,'Anton Bauer'], find_grade( '** Anton Bauer' )
|
15
47
|
assert_equal [3,'Anton Bauer'], find_grade( '* Anton Bauer' )
|
16
48
|
assert_equal [4,'Anton Bauer'], find_grade( 'Anton Bauer' )
|
17
49
|
|
18
50
|
assert_equal [1,'Anton Bauer'], find_grade( 'Anton Bauer ***' )
|
19
|
-
|
20
51
|
end
|
21
52
|
|
22
|
-
|
23
|
-
|
24
53
|
end # class TestTitleFinder
|
25
54
|
|
data/test/test_title_helper.rb
CHANGED
@@ -62,7 +62,8 @@ class TestTitleHelper < Minitest::Test
|
|
62
62
|
[ "Ṣan'ā' [Sana'a]", 'sana'],
|
63
63
|
[ "P'yŏngyang [Pyongyang]", 'pyongyang' ],
|
64
64
|
[ 'Kāṭhmāḍaũ [Kathmandu]', 'kathmadau' ],
|
65
|
-
[ "Pe\u{030C}awar", 'pexawar'] ## note: use unicode literal; Pex̌awar -- see en.wikipedia.org/wiki/Peshawar
|
65
|
+
[ "Pe\u{030C}awar", 'pexawar'], ## note: use unicode literal; Pex̌awar -- see en.wikipedia.org/wiki/Peshawar
|
66
|
+
[ '1850 München', '1850muenchen'],
|
66
67
|
]
|
67
68
|
|
68
69
|
txt_io.each do |txt|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.3
|
4
|
+
version: 1.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: props
|
@@ -124,7 +124,9 @@ files:
|
|
124
124
|
- lib/textutils/helper/tag_helper.rb
|
125
125
|
- lib/textutils/helper/title_helper.rb
|
126
126
|
- lib/textutils/helper/unicode_helper.rb
|
127
|
-
- lib/textutils/helper/value_helper.rb
|
127
|
+
- lib/textutils/helper/value_helper_i.rb
|
128
|
+
- lib/textutils/helper/value_helper_ii.rb
|
129
|
+
- lib/textutils/helper/value_helper_iii_numbers.rb
|
128
130
|
- lib/textutils/helper/xml_helper.rb
|
129
131
|
- lib/textutils/page.rb
|
130
132
|
- lib/textutils/parser/name_parser.rb
|
@@ -1,249 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module TextUtils
|
5
|
-
module ValueHelper
|
6
|
-
|
7
|
-
#####
|
8
|
-
## fix: move to beerdb ??? why? why not?? - yes, move to beerdb-models
|
9
|
-
|
10
|
-
def match_brewery( value )
|
11
|
-
if value =~ /^by:/ ## by: -brewed by/brewery
|
12
|
-
brewery_key = value[3..-1] ## cut off by: prefix
|
13
|
-
brewery = BeerDb::Model::Brewery.find_by_key!( brewery_key )
|
14
|
-
yield( brewery )
|
15
|
-
true # bingo - match found
|
16
|
-
else
|
17
|
-
false # no match found
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
|
22
|
-
def is_year?( value )
|
23
|
-
# founded/established year e.g. 1776
|
24
|
-
match_result = value =~ /^[0-9]{4}$/
|
25
|
-
# match found if 0,1,2,3 etc or no match if nil
|
26
|
-
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
27
|
-
match_result != nil
|
28
|
-
end
|
29
|
-
|
30
|
-
|
31
|
-
def match_year( value )
|
32
|
-
if is_year?( value ) # founded/established year e.g. 1776
|
33
|
-
yield( value.to_i )
|
34
|
-
true # bingo - match found
|
35
|
-
else
|
36
|
-
false # no match found
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
|
41
|
-
def match_km_squared( value )
|
42
|
-
## allow numbers like 453 km² or 45_000 km2
|
43
|
-
if value =~ /^([0-9][0-9 _]+[0-9]|[0-9]{1,2})(?:\s*(?:km2|km²)\s*)$/
|
44
|
-
num = value.gsub( 'km2', '').gsub( 'km²', '' ).gsub(/[ _]/, '').to_i
|
45
|
-
yield( num )
|
46
|
-
true # bingo - match found
|
47
|
-
else
|
48
|
-
false # no match found
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def match_number( value )
|
53
|
-
## numeric (nb: can use any _ or spaces inside digits e.g. 1_000_000 or 1 000 000)
|
54
|
-
if value =~ /^([0-9][0-9 _]+[0-9])|([0-9]{1,2})$/
|
55
|
-
num = value.gsub(/[ _]/, '').to_i
|
56
|
-
yield( num )
|
57
|
-
true # bingo - match found
|
58
|
-
else
|
59
|
-
false # no match found
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
|
64
|
-
def match_abv( value ) # alcohol by volume (abv) e.g. 5.2%
|
65
|
-
if value =~ /^<?\s*(\d+(?:\.\d+)?)\s*%$/
|
66
|
-
# nb: allow leading < e.g. <0.5%
|
67
|
-
yield( $1.to_f ) # convert to decimal? how? use float?
|
68
|
-
true # bingo - match found
|
69
|
-
else
|
70
|
-
false # no match found
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def match_og( value ) # plato (stammwuerze/gravity?) e.g. 11.2°
|
75
|
-
if value =~ /^(\d+(?:\.\d+)?)°$/
|
76
|
-
# nb: no whitespace allowed between ° and number e.g. 11.2°
|
77
|
-
yield( $1.to_f ) # convert to decimal? how? use float?
|
78
|
-
true # bingo - match found
|
79
|
-
else
|
80
|
-
false # no match found
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def match_kcal( value )
|
85
|
-
if value =~ /^(\d+(?:\.\d+)?)\s*kcal(?:\/100ml)?$/ # kcal
|
86
|
-
# nb: allow 44.4 kcal/100ml or 44.4 kcal or 44.4kcal
|
87
|
-
yield( $1.to_f ) # convert to decimal? how? use float?
|
88
|
-
true # bingo - match found
|
89
|
-
else
|
90
|
-
false # no match found
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def match_hl( value ) # hector liters (hl) 1hl = 100l
|
95
|
-
if value =~ /^(?:([0-9][0-9_ ]+[0-9]|[0-9]{1,2})\s*hl)$/ # e.g. 20_000 hl or 50hl etc.
|
96
|
-
yield( $1.gsub( /[ _]/, '' ).to_i )
|
97
|
-
true # bingo - match found
|
98
|
-
else
|
99
|
-
false # no match found
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
|
104
|
-
def is_website?( value )
|
105
|
-
# check for url/internet address e.g. www.ottakringer.at
|
106
|
-
# - must start w/ www. or
|
107
|
-
# - must end w/ .com
|
108
|
-
#
|
109
|
-
# fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
|
110
|
-
match_result = value =~ /^www\.|\.com$/
|
111
|
-
# match found if 0,1,2,3 etc or no match if nil
|
112
|
-
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
113
|
-
match_result != nil
|
114
|
-
end
|
115
|
-
|
116
|
-
def match_website( value )
|
117
|
-
if is_website?( value ) # check for url/internet address e.g. www.ottakringer.at
|
118
|
-
# fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
|
119
|
-
yield( value )
|
120
|
-
true # bingo - match found
|
121
|
-
else
|
122
|
-
false # no match found
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
def is_address?( value )
|
129
|
-
# if value includes // assume address e.g. 3970 Weitra // Sparkasseplatz 160
|
130
|
-
match_result = value =~ /\/{2}/
|
131
|
-
# match found if 0,1,2,3 etc or no match if nil
|
132
|
-
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
133
|
-
match_result != nil
|
134
|
-
end
|
135
|
-
|
136
|
-
def is_taglist?( value )
|
137
|
-
### note: cannot start w/ number must be letter for now
|
138
|
-
## -- in the future allow free standing years (e.g. 1980 etc.?? why? why not?)
|
139
|
-
## e.g. not allowed 14 ha or 5_000 hl etc.
|
140
|
-
match_result = value =~ /^([a-z][a-z0-9\|_ ]*[a-z0-9]|[a-z])$/
|
141
|
-
# match found if 0,1,2,3 etc or no match if nil
|
142
|
-
# note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
|
143
|
-
match_result != nil
|
144
|
-
end
|
145
|
-
|
146
|
-
|
147
|
-
def find_grade( value ) # NB: returns ary [grade,value] / two values
|
148
|
-
grade = 4 # defaults to grade 4 e.g *** => 1, ** => 2, * => 3, -/- => 4
|
149
|
-
|
150
|
-
# NB: stars must end field/value or start field/value
|
151
|
-
# e.g.
|
152
|
-
# *** Anton Bauer or
|
153
|
-
# Anton Bauer ***
|
154
|
-
|
155
|
-
value = value.sub( /^\s*(\*{1,3})\s+/ ) do |_|
|
156
|
-
if $1 == '***'
|
157
|
-
grade = 1
|
158
|
-
elsif $1 == '**'
|
159
|
-
grade = 2
|
160
|
-
elsif $1 == '*'
|
161
|
-
grade = 3
|
162
|
-
else
|
163
|
-
# unknown grade; not possible, is'it?
|
164
|
-
end
|
165
|
-
'' # remove * from title if found
|
166
|
-
end
|
167
|
-
|
168
|
-
value = value.sub( /\s+(\*{1,3})\s*$/ ) do |_|
|
169
|
-
if $1 == '***'
|
170
|
-
grade = 1
|
171
|
-
elsif $1 == '**'
|
172
|
-
grade = 2
|
173
|
-
elsif $1 == '*'
|
174
|
-
grade = 3
|
175
|
-
else
|
176
|
-
# unknown grade; not possible, is'it?
|
177
|
-
end
|
178
|
-
'' # remove * from title if found
|
179
|
-
end
|
180
|
-
|
181
|
-
[grade,value]
|
182
|
-
end
|
183
|
-
|
184
|
-
|
185
|
-
def find_key_n_title( values ) # NB: returns ary [attribs,more_values] / two values
|
186
|
-
|
187
|
-
## fix: add/configure logger for ActiveRecord!!!
|
188
|
-
logger = LogKernel::Logger.root
|
189
|
-
|
190
|
-
### todo/fix: allow check - do NOT allow mixed use of with key and w/o key
|
191
|
-
## either use keys or do NOT use keys; do NOT mix in a single fixture file
|
192
|
-
|
193
|
-
### support autogenerate key from first title value
|
194
|
-
|
195
|
-
# if it looks like a key (only a-z lower case allowed); assume it's a key
|
196
|
-
# - also allow . in keys e.g. world.quali.america, at.cup, etc.
|
197
|
-
# - also allow 0-9 in keys e.g. at.2, at.3.1, etc.
|
198
|
-
|
199
|
-
# fix/todo: add support for leading underscore _
|
200
|
-
# or allow keys starting w/ digits?
|
201
|
-
|
202
|
-
# NB: key must start w/ a-z letter (NB: minimum one letter possible)
|
203
|
-
if values[0] =~ /^([a-z][a-z0-9.]*[a-z0-9]|[a-z])$/
|
204
|
-
key_col = values[0]
|
205
|
-
title_col = values[1]
|
206
|
-
more_values = values[2..-1]
|
207
|
-
else
|
208
|
-
key_col = '<auto>'
|
209
|
-
title_col = values[0]
|
210
|
-
more_values = values[1..-1]
|
211
|
-
end
|
212
|
-
|
213
|
-
attribs = {}
|
214
|
-
|
215
|
-
## check title_col for grade (e.g. ***/**/*) and use returned stripped title_col if exits
|
216
|
-
grade, title_col = find_grade( title_col )
|
217
|
-
|
218
|
-
# NB: for now - do NOT include default grade e.g. if grade (***/**/*) not present; attrib will not be present too
|
219
|
-
if grade == 1 || grade == 2 || grade == 3 # grade found/present
|
220
|
-
logger.debug " found grade #{grade} in title"
|
221
|
-
attribs[:grade] = grade
|
222
|
-
end
|
223
|
-
|
224
|
-
## fix/todo: add find parts ??
|
225
|
-
# e.g. ‹Estrella› ‹Damm› Inedit
|
226
|
-
# becomes => title: 'Estrella Damm Inedit' and parts: ['Estrella','Damm']
|
227
|
-
|
228
|
-
## title (split of optional synonyms)
|
229
|
-
# e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
|
230
|
-
titles = title_col.split('|')
|
231
|
-
|
232
|
-
attribs[ :title ] = titles[0]
|
233
|
-
|
234
|
-
## add optional synonyms if present
|
235
|
-
attribs[ :synonyms ] = titles[1..-1].join('|') if titles.size > 1
|
236
|
-
|
237
|
-
if key_col == '<auto>'
|
238
|
-
## autogenerate key from first title
|
239
|
-
key_col = TextUtils.title_to_key( titles[0] )
|
240
|
-
logger.debug " autogen key »#{key_col}« from title »#{titles[0]}«"
|
241
|
-
end
|
242
|
-
|
243
|
-
attribs[ :key ] = key_col
|
244
|
-
|
245
|
-
[attribs, more_values]
|
246
|
-
end
|
247
|
-
|
248
|
-
end # module ValueHelper
|
249
|
-
end # module TextUtils
|