textutils 1.2.3 → 1.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 13b50f88222d75883a9804ba452aa3dce5f4c517
4
- data.tar.gz: 08c6eee7b70caf74b716fd4ed3a8d1d1d0c10967
3
+ metadata.gz: 16e24e7bc0a1004bc3fca49b6f3bfcaa6ce2e5ce
4
+ data.tar.gz: 6aa074466c9c89b089ecf0ac6d75164050abd0cb
5
5
  SHA512:
6
- metadata.gz: 1ce49f78d634c23a8843893eb7afea9a3b8f9aa1fc9c713f6cd22ff47dadfbf4a893ce96b0f91ecfc118e6e65624b9474a1f0e80e83a6d7053a1894a19f3fe2b
7
- data.tar.gz: 5361f93e31737603810fd17b2816677274612632d3cae45aff9b96031e253e36ada6fa3ad084bded187bc527338570cde761fb2c3e96f52f647fad798abca263
6
+ metadata.gz: f386324301ffc37deba32eb32202edf9b8706ff1b14971ba5a41db22a2d9a314be64914b4296e359cea2620a6b54ad89ae1cfa6e7c9fa1322ec4ef29020fb688
7
+ data.tar.gz: d56300976f712b8bb9de8f1e2e571948f35783090d1d7f1a238736f8efd6a60aba915d5ff807cb942a38f47f80b8d6917e6e1e846371b72b62022c8b3ad029f0
@@ -18,7 +18,9 @@ lib/textutils/helper/hypertext_helper.rb
18
18
  lib/textutils/helper/tag_helper.rb
19
19
  lib/textutils/helper/title_helper.rb
20
20
  lib/textutils/helper/unicode_helper.rb
21
- lib/textutils/helper/value_helper.rb
21
+ lib/textutils/helper/value_helper_i.rb
22
+ lib/textutils/helper/value_helper_ii.rb
23
+ lib/textutils/helper/value_helper_iii_numbers.rb
22
24
  lib/textutils/helper/xml_helper.rb
23
25
  lib/textutils/page.rb
24
26
  lib/textutils/parser/name_parser.rb
@@ -43,7 +43,9 @@ require 'textutils/helper/unicode_helper'
43
43
  require 'textutils/helper/tag_helper'
44
44
  require 'textutils/helper/title_helper'
45
45
  require 'textutils/helper/address_helper'
46
- require 'textutils/helper/value_helper'
46
+ require 'textutils/helper/value_helper_i'
47
+ require 'textutils/helper/value_helper_ii'
48
+ require 'textutils/helper/value_helper_iii_numbers'
47
49
 
48
50
  require 'textutils/utils'
49
51
  require 'textutils/core_ext/file'
@@ -0,0 +1,139 @@
1
+ # encoding: utf-8
2
+
3
+
4
module TextUtils
  module ValueHelper

    # Pattern deciding whether the first column of a record is an explicit key
    # (as opposed to a title the key gets auto-generated from).
    #   - only lower case a-z allowed
    #   - also allow . in keys e.g. world.quali.america, at.cup, etc.
    #   - also allow 0-9 in keys e.g. at.2, at.3.1, etc.
    #   - also allow leading digits e.g. 1850muenchen, 3kronen, etc.
    TITLE_KEY_REGEX = /^(
                          [a-z][a-z0-9.]*[a-z0-9]
                         |
                          [a-z]                # allow single letter keys e.g. n,s,etc.
                         |
                          [1-9][0-9]*[a-z]+    # NOTE: also allow starts with leading digits e.g. 1850muenchen, 3kronen etc.;
                                               #   *MUST* be followed by letter;
                                               #   note: leading zero for now *NOT* allowed
                        )$
                      /x


    # Splits a record (array of column strings) into its key/title attributes
    # and the remaining columns.
    #
    # If the first column looks like a key (see TITLE_KEY_REGEX) it is used as
    # the key and the second column as the title; otherwise the first column is
    # the title and the key gets generated from it via TextUtils.title_to_key.
    #
    # values  - array of strings; [key, title, more...] or [title, more...]
    #
    # Returns a two-element array [attribs, more_values]; attribs holds :title
    # and :key, plus :grade (only if stars are present) and :synonyms (only if
    # alternative names are present, joined with |).
    #
    # Raises ArgumentError if no title column is present (e.g. a record made up
    # of a key only); before this surfaced as a NoMethodError on nil.
    #
    # todo/fix:
    #   - change title to name
    #   - change synonyms to alt_names (!!!)
    #   => use new method e.g. find_key_n_name(s) - why?? why not??
    def find_key_n_title( values )
      ### support autogenerate key from first title value
      if values[0] =~ TITLE_KEY_REGEX
        key_col, title_col, more_values = values[0], values[1], values[2..-1]
      else
        key_col, title_col, more_values = '<auto>', values[0], values[1..-1]
      end

      if title_col.nil?
        raise ArgumentError, "find_key_n_title: no title found in values #{values.inspect}"
      end

      ## fix: add/configure logger for ActiveRecord!!!
      logger = LogKernel::Logger.root

      attribs = {}

      ## check title_col for grade (e.g. ***/**/*) and continue with the stripped title
      grade, title_col = find_grade( title_col )

      # NB: for now - do NOT include the default grade (4);
      #   if no stars are present the :grade attrib will not be present too
      if [1, 2, 3].include?( grade )
        logger.debug "   found grade #{grade} in title"
        attribs[:grade] = grade
      end

      ## fix/todo: add find parts ??
      #   e.g. ‹Estrella› ‹Damm› Inedit
      #   becomes =>  title: 'Estrella Damm Inedit' and parts: ['Estrella','Damm']

      ## title (split of optional tree hierarchy)
      ##  e.g. Leverkusen › Köln/Bonn › Nordrhein-Westfalen
      ##       Gelsenkirchen › Ruhrgebiet › Nordrhein-Westfalen
      ##       München [Munich] › Bayern    etc.
      ##
      ## fix!!!! - trailing hierarchy gets *ignored* for now (only title_tree[0] is used)
      ##   pass along in :tree (or :hierarchy) ??
      ##
      ## note: separator must include leading and trailing space for now (fix!! later);
      ##   hack for avoiding conflict w/ parts; fix: read/parse parts first
      ## todo: also allow > (as an alternative to ›)
      title_tree = title_col.split( /[ ]+[›][ ]+/ )

      ## title (split of optional synonyms)
      #   e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
      #        München [Munich]
      titles = NameTokenizer.new.tokenize( title_tree[0] )

      attribs[ :title ] = titles[0]

      ## add optional synonyms if present
      attribs[ :synonyms ] = titles[1..-1].join('|')  if titles.size > 1

      if key_col == '<auto>'
        ## autogenerate key from first title
        key_col = TextUtils.title_to_key( titles[0] )
        logger.debug "   autogen key »#{key_col}« from title »#{titles[0]}«"
      end

      attribs[ :key ] = key_col

      [attribs, more_values]
    end


    # Extracts an optional star grade from the start or the end of a value.
    #
    #   *** => 1, ** => 2, * => 3, no stars => 4 (default)
    #
    # NB: stars must start or end the value and be separated by whitespace
    #   e.g.  *** Anton Bauer   or   Anton Bauer ***
    #
    # value  - string to scan
    #
    # Returns a two-element array [grade, value] with the stars (and the
    # whitespace next to them) removed from value.
    def find_grade( value )
      grade = 4

      # leading stars get checked first, trailing stars second; if both are
      # present the trailing ones win (same order as before)
      [ /^\s*(\*{1,3})\s+/, /\s+(\*{1,3})\s*$/ ].each do |stars_regex|
        value = value.sub( stars_regex ) do |_|
          grade = 4 - $1.length    # 3 stars => 1, 2 stars => 2, 1 star => 3
          ''                       # remove stars from value
        end
      end

      [grade, value]
    end

  end # module ValueHelper
end # module TextUtils
@@ -0,0 +1,83 @@
1
+ # encoding: utf-8
2
+
3
+
4
module TextUtils
  module ValueHelper

    #####
    ## fix!!!!: move to beerdb ??? why? why not?? - yes, move to beerdb-models

    # Handles a "by:<key>" value (brewed by / brewery).
    # Looks up the brewery for the key following the by: prefix and yields it.
    # Returns true if the value starts with by:, false otherwise.
    def match_brewery( value )
      return false  unless value =~ /^by:/

      key = value[3..-1]    ## cut off by: prefix
      yield( BeerDb::Model::Brewery.find_by_key!( key ) )
      true
    end


    # Checks for a founded/established year, that is, exactly four digits e.g. 1776.
    # Always returns a bool (true|false) - never the match position or nil.
    def is_year?( value )
      !(value =~ /^[0-9]{4}$/).nil?
    end


    # Yields the year as an integer if the value is a year (see is_year?).
    # Returns true on match, false otherwise.
    def match_year( value )
      return false  unless is_year?( value )

      yield( value.to_i )
      true
    end


    # Checks for an address; any value including // is assumed to be an
    # address e.g. 3970 Weitra // Sparkasseplatz 160.
    # Always returns a bool (true|false) - never the match position or nil.
    def is_address?( value )
      !(value =~ /\/{2}/).nil?
    end

    # Checks for a tag list (lower case words separated by |, _ or space).
    #
    # note: cannot start w/ a number; must start w/ a letter for now
    #   e.g. not allowed 14 ha or 5_000 hl etc.
    #   -- in the future allow free standing years (e.g. 1980 etc.?? why? why not?)
    # Always returns a bool (true|false) - never the match position or nil.
    def is_taglist?( value )
      !(value =~ /^([a-z][a-z0-9\|_ ]*[a-z0-9]|[a-z])$/).nil?
    end


    # Checks for a url/internet address e.g. www.ottakringer.at
    #   - must start w/ www. or
    #   - must end w/ .com
    #
    # fix: support more url formats (e.g. w/o www. - look for .com .country code etc.)
    # Always returns a bool (true|false) - never the match position or nil.
    def is_website?( value )
      !(value =~ /^www\.|\.com$/).nil?
    end

    # Yields the value unchanged if it is a website (see is_website?).
    # Returns true on match, false otherwise.
    def match_website( value )
      return false  unless is_website?( value )

      yield( value )
      true
    end

  end # module ValueHelper
end # module TextUtils
@@ -0,0 +1,78 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ # match numbers (units)
5
+ # e.g km_squared, abv, etc.
6
+
7
module TextUtils
  module ValueHelper

    # Matches a plain integer and yields it.
    #
    # note: any _ or spaces may be used inside the digits as separators
    #   e.g. 1_000_000 or 1 000 000
    #
    # The whole value must be a number: the alternatives are grouped so that
    # both anchors apply to both of them (before, ^ applied only to the first
    # and $ only to the second alternative, which let values such as 123abc
    # or abc12 slip through).
    #
    # Returns true on match (after yielding the integer), false otherwise.
    def match_number( value )
      if value =~ /^(?:[0-9][0-9 _]+[0-9]|[0-9]{1,2})$/
        yield( value.gsub( /[ _]/, '' ).to_i )
        true   # bingo - match found
      else
        false  # no match found
      end
    end


    ###########################
    ## numbers w/ units

    # Matches an area in square kilometers e.g. 453 km² or 45_000 km2
    # and yields the number as an integer (separators removed).
    # Returns true on match, false otherwise.
    def match_km_squared( value )
      if value =~ /^([0-9][0-9 _]+[0-9]|[0-9]{1,2})(?:\s*(?:km2|km²)\s*)$/
        yield( $1.gsub( /[ _]/, '' ).to_i )   # $1 holds the digits only (no unit)
        true   # bingo - match found
      else
        false  # no match found
      end
    end

    # Matches alcohol by volume (abv) e.g. 5.2% and yields it as a float.
    # nb: allows a leading < e.g. <0.5% (the < itself gets dropped)
    # Returns true on match, false otherwise.
    def match_abv( value )
      if value =~ /^<?\s*(\d+(?:\.\d+)?)\s*%$/
        yield( $1.to_f )   # convert to decimal? how? use float?
        true   # bingo - match found
      else
        false  # no match found
      end
    end

    # Matches plato (stammwuerze/gravity?) e.g. 11.2° and yields it as a float.
    # nb: no whitespace allowed between the number and °
    # Returns true on match, false otherwise.
    def match_og( value )
      if value =~ /^(\d+(?:\.\d+)?)°$/
        yield( $1.to_f )   # convert to decimal? how? use float?
        true   # bingo - match found
      else
        false  # no match found
      end
    end

    # Matches kcal and yields the number as a float.
    # nb: allows 44.4 kcal/100ml or 44.4 kcal or 44.4kcal
    # Returns true on match, false otherwise.
    def match_kcal( value )
      if value =~ /^(\d+(?:\.\d+)?)\s*kcal(?:\/100ml)?$/
        yield( $1.to_f )   # convert to decimal? how? use float?
        true   # bingo - match found
      else
        false  # no match found
      end
    end

    # Matches hectoliters (hl; 1 hl = 100 l) e.g. 20_000 hl or 50hl
    # and yields the number as an integer (separators removed).
    # Returns true on match, false otherwise.
    def match_hl( value )
      if value =~ /^(?:([0-9][0-9_ ]+[0-9]|[0-9]{1,2})\s*hl)$/
        yield( $1.gsub( /[ _]/, '' ).to_i )
        true   # bingo - match found
      else
        false  # no match found
      end
    end

  end # module ValueHelper
end # module TextUtils
@@ -2,6 +2,55 @@
2
2
 
3
3
  # fix: move into TextUtils namespace/module!! ??
4
4
 
5
+
6
class NameTokenizer   ## - rename to NameScanner, NameSplitter, NameSeparator, etc.

  ## split (single) string value into array of names
  ##  e.g.
  ##   'München [Munich]'                => ['München', '[Munich]']
  ##   'Wr. Neustadt | Wiener Neustadt'  => ['Wr. Neustadt', 'Wiener Neustadt']
  include LogUtils::Logging

  # Splits a value into its names.
  #
  #  1) splits by | (pipe) - whitespace around the pipe gets removed
  #  2) splits "inline" translations in square brackets e.g. München [Munich];
  #     bracketed names keep their brackets
  #
  # A [ without a matching ] (or an empty []) cannot be scanned as a bracketed
  # name; in that case the rest of the part is kept as a single name and a
  # warning gets logged. (Before, the scanner never advanced and the loop did
  # not terminate.)
  #
  # value  - string with one or more names
  #
  # Returns an array of name strings.
  #
  # todo: add support for Munich [en] e.g. trailing lang tag
  # todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??
  def tokenize( value )   ## rename to/use split - why? why not??
    names = []

    value.split( /[ \t]*\|[ \t]*/ ).each do |part|
      s = StringScanner.new( part )
      s.skip( /[ \t]+/ )   # skip leading whitespaces

      until s.eos?
        if s.check( /\[/ )
          ## scan everything until the end of bracket (e.g. ])
          name = s.scan( /\[[^\]]+\]/ )
          if name.nil?
            ## starting w/ [ but no (proper) closing ] found - keep the rest as is
            name = s.rest.rstrip
            s.terminate
            logger.warn( "[NameTokenizer] missing closing bracket in >#{part}<; keeping >#{name}< as is" )
          end
        else
          ## scan everything until the begin of bracket (e.g. [)
          ##   and remove trailing spaces (if present)
          name = s.scan( /[^\[]+/ ).rstrip
        end
        names << name

        s.skip( /[ \t]+/ )   # skip whitespaces
        logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
      end
    end # each part

    logger.debug( "[NameTokenizer] names=#{names.inspect}" )
    names
  end # method tokenize
end # class NameTokenizer
51
+
52
+
53
+ =begin
5
54
  class NameParser
6
55
 
7
56
  include LogUtils::Logging
@@ -9,4 +58,4 @@ class NameParser
9
58
  ## to be done
10
59
 
11
60
  end # class NameParser
12
-
61
+ =end
@@ -4,7 +4,7 @@ module TextUtils
4
4
 
5
5
  MAJOR = 1 ## todo: namespace inside version or something - why? why not??
6
6
  MINOR = 2
7
- PATCH = 3
7
+ PATCH = 4
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -1,5 +1,9 @@
1
1
  # encoding: utf-8
2
2
 
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_title_finder.rb
6
+
3
7
 
4
8
  require 'helper'
5
9
 
@@ -8,18 +12,43 @@ class TestTitleFinder < Minitest::Test
8
12
 
9
13
  include TextUtils::ValueHelper # lets us use find_grade, etc.
10
14
 
11
- def test_grade
12
15
 
16
+ def test_find_key_n_title
17
+ attribs, _ = find_key_n_title( ['München [Munich]'] ) ## skip returned more_values (use _)
18
+ assert_equal 'muenchen', attribs[:key]
19
+ assert_equal 'München', attribs[:title]
20
+ assert_equal '[Munich]', attribs[:synonyms]
21
+ end
22
+
23
+ def test_find_key_n_title_w_tree
24
+ attribs, _ = find_key_n_title( ['München [Munich] › Oberbayern › Bayern'] ) ## skip returned more_values (use _)
25
+ assert_equal 'muenchen', attribs[:key]
26
+ assert_equal 'München', attribs[:title]
27
+ assert_equal '[Munich]', attribs[:synonyms]
28
+ end
29
+
30
+
31
+ def test_title_tokenizer
32
+ names = NameTokenizer.new.tokenize( 'München [Munich]' )
33
+ assert_equal 2, names.size
34
+ assert_equal 'München', names[0]
35
+ assert_equal '[Munich]', names[1]
36
+
37
+ names = NameTokenizer.new.tokenize( 'FC Bayern Muenchen|Bayern Muenchen|Bayern' )
38
+ assert_equal 3, names.size
39
+ assert_equal 'FC Bayern Muenchen', names[0]
40
+ assert_equal 'Bayern Muenchen', names[1]
41
+ assert_equal 'Bayern', names[2]
42
+ end
43
+
44
+ def test_grade
13
45
  assert_equal [1,'Anton Bauer'], find_grade( '*** Anton Bauer' )
14
46
  assert_equal [2,'Anton Bauer'], find_grade( '** Anton Bauer' )
15
47
  assert_equal [3,'Anton Bauer'], find_grade( '* Anton Bauer' )
16
48
  assert_equal [4,'Anton Bauer'], find_grade( 'Anton Bauer' )
17
49
 
18
50
  assert_equal [1,'Anton Bauer'], find_grade( 'Anton Bauer ***' )
19
-
20
51
  end
21
52
 
22
-
23
-
24
53
  end # class TestTitleFinder
25
54
 
@@ -62,7 +62,8 @@ class TestTitleHelper < Minitest::Test
62
62
  [ "Ṣan'ā' [Sana'a]", 'sana'],
63
63
  [ "P'yŏngyang [Pyongyang]", 'pyongyang' ],
64
64
  [ 'Kāṭhmāḍaũ [Kathmandu]', 'kathmadau' ],
65
- [ "Pe\u{030C}awar", 'pexawar'] ## note: use unicode literal; Pex̌awar -- see en.wikipedia.org/wiki/Peshawar
65
+ [ "Pe\u{030C}awar", 'pexawar'], ## note: use unicode literal; Pex̌awar -- see en.wikipedia.org/wiki/Peshawar
66
+ [ '1850 München', '1850muenchen'],
66
67
  ]
67
68
 
68
69
  txt_io.each do |txt|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.3
4
+ version: 1.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-11 00:00:00.000000000 Z
11
+ date: 2015-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: props
@@ -124,7 +124,9 @@ files:
124
124
  - lib/textutils/helper/tag_helper.rb
125
125
  - lib/textutils/helper/title_helper.rb
126
126
  - lib/textutils/helper/unicode_helper.rb
127
- - lib/textutils/helper/value_helper.rb
127
+ - lib/textutils/helper/value_helper_i.rb
128
+ - lib/textutils/helper/value_helper_ii.rb
129
+ - lib/textutils/helper/value_helper_iii_numbers.rb
128
130
  - lib/textutils/helper/xml_helper.rb
129
131
  - lib/textutils/page.rb
130
132
  - lib/textutils/parser/name_parser.rb
@@ -1,249 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module TextUtils
5
- module ValueHelper
6
-
7
- #####
8
- ## fix: move to beerdb ??? why? why not?? - yes, move to beerdb-models
9
-
10
- def match_brewery( value )
11
- if value =~ /^by:/ ## by: -brewed by/brewery
12
- brewery_key = value[3..-1] ## cut off by: prefix
13
- brewery = BeerDb::Model::Brewery.find_by_key!( brewery_key )
14
- yield( brewery )
15
- true # bingo - match found
16
- else
17
- false # no match found
18
- end
19
- end
20
-
21
-
22
- def is_year?( value )
23
- # founded/established year e.g. 1776
24
- match_result = value =~ /^[0-9]{4}$/
25
- # match found if 0,1,2,3 etc or no match if nil
26
- # note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
27
- match_result != nil
28
- end
29
-
30
-
31
- def match_year( value )
32
- if is_year?( value ) # founded/established year e.g. 1776
33
- yield( value.to_i )
34
- true # bingo - match found
35
- else
36
- false # no match found
37
- end
38
- end
39
-
40
-
41
- def match_km_squared( value )
42
- ## allow numbers like 453 km² or 45_000 km2
43
- if value =~ /^([0-9][0-9 _]+[0-9]|[0-9]{1,2})(?:\s*(?:km2|km²)\s*)$/
44
- num = value.gsub( 'km2', '').gsub( 'km²', '' ).gsub(/[ _]/, '').to_i
45
- yield( num )
46
- true # bingo - match found
47
- else
48
- false # no match found
49
- end
50
- end
51
-
52
- def match_number( value )
53
- ## numeric (nb: can use any _ or spaces inside digits e.g. 1_000_000 or 1 000 000)
54
- if value =~ /^([0-9][0-9 _]+[0-9])|([0-9]{1,2})$/
55
- num = value.gsub(/[ _]/, '').to_i
56
- yield( num )
57
- true # bingo - match found
58
- else
59
- false # no match found
60
- end
61
- end
62
-
63
-
64
- def match_abv( value ) # alcohol by volume (abv) e.g. 5.2%
65
- if value =~ /^<?\s*(\d+(?:\.\d+)?)\s*%$/
66
- # nb: allow leading < e.g. <0.5%
67
- yield( $1.to_f ) # convert to decimal? how? use float?
68
- true # bingo - match found
69
- else
70
- false # no match found
71
- end
72
- end
73
-
74
- def match_og( value ) # plato (stammwuerze/gravity?) e.g. 11.2°
75
- if value =~ /^(\d+(?:\.\d+)?)°$/
76
- # nb: no whitespace allowed between ° and number e.g. 11.2°
77
- yield( $1.to_f ) # convert to decimal? how? use float?
78
- true # bingo - match found
79
- else
80
- false # no match found
81
- end
82
- end
83
-
84
- def match_kcal( value )
85
- if value =~ /^(\d+(?:\.\d+)?)\s*kcal(?:\/100ml)?$/ # kcal
86
- # nb: allow 44.4 kcal/100ml or 44.4 kcal or 44.4kcal
87
- yield( $1.to_f ) # convert to decimal? how? use float?
88
- true # bingo - match found
89
- else
90
- false # no match found
91
- end
92
- end
93
-
94
- def match_hl( value ) # hector liters (hl) 1hl = 100l
95
- if value =~ /^(?:([0-9][0-9_ ]+[0-9]|[0-9]{1,2})\s*hl)$/ # e.g. 20_000 hl or 50hl etc.
96
- yield( $1.gsub( /[ _]/, '' ).to_i )
97
- true # bingo - match found
98
- else
99
- false # no match found
100
- end
101
- end
102
-
103
-
104
- def is_website?( value )
105
- # check for url/internet address e.g. www.ottakringer.at
106
- # - must start w/ www. or
107
- # - must end w/ .com
108
- #
109
- # fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
110
- match_result = value =~ /^www\.|\.com$/
111
- # match found if 0,1,2,3 etc or no match if nil
112
- # note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
113
- match_result != nil
114
- end
115
-
116
- def match_website( value )
117
- if is_website?( value ) # check for url/internet address e.g. www.ottakringer.at
118
- # fix: support more url format (e.g. w/o www. - look for .com .country code etc.)
119
- yield( value )
120
- true # bingo - match found
121
- else
122
- false # no match found
123
- end
124
- end
125
-
126
-
127
-
128
- def is_address?( value )
129
- # if value includes // assume address e.g. 3970 Weitra // Sparkasseplatz 160
130
- match_result = value =~ /\/{2}/
131
- # match found if 0,1,2,3 etc or no match if nil
132
- # note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
133
- match_result != nil
134
- end
135
-
136
- def is_taglist?( value )
137
- ### note: cannot start w/ number must be letter for now
138
- ## -- in the future allow free standing years (e.g. 1980 etc.?? why? why not?)
139
- ## e.g. not allowed 14 ha or 5_000 hl etc.
140
- match_result = value =~ /^([a-z][a-z0-9\|_ ]*[a-z0-9]|[a-z])$/
141
- # match found if 0,1,2,3 etc or no match if nil
142
- # note: return bool e.g. false|true (not 0,1,2,3 etc. and nil)
143
- match_result != nil
144
- end
145
-
146
-
147
- def find_grade( value ) # NB: returns ary [grade,value] / two values
148
- grade = 4 # defaults to grade 4 e.g *** => 1, ** => 2, * => 3, -/- => 4
149
-
150
- # NB: stars must end field/value or start field/value
151
- # e.g.
152
- # *** Anton Bauer or
153
- # Anton Bauer ***
154
-
155
- value = value.sub( /^\s*(\*{1,3})\s+/ ) do |_|
156
- if $1 == '***'
157
- grade = 1
158
- elsif $1 == '**'
159
- grade = 2
160
- elsif $1 == '*'
161
- grade = 3
162
- else
163
- # unknown grade; not possible, is'it?
164
- end
165
- '' # remove * from title if found
166
- end
167
-
168
- value = value.sub( /\s+(\*{1,3})\s*$/ ) do |_|
169
- if $1 == '***'
170
- grade = 1
171
- elsif $1 == '**'
172
- grade = 2
173
- elsif $1 == '*'
174
- grade = 3
175
- else
176
- # unknown grade; not possible, is'it?
177
- end
178
- '' # remove * from title if found
179
- end
180
-
181
- [grade,value]
182
- end
183
-
184
-
185
- def find_key_n_title( values ) # NB: returns ary [attribs,more_values] / two values
186
-
187
- ## fix: add/configure logger for ActiveRecord!!!
188
- logger = LogKernel::Logger.root
189
-
190
- ### todo/fix: allow check - do NOT allow mixed use of with key and w/o key
191
- ## either use keys or do NOT use keys; do NOT mix in a single fixture file
192
-
193
- ### support autogenerate key from first title value
194
-
195
- # if it looks like a key (only a-z lower case allowed); assume it's a key
196
- # - also allow . in keys e.g. world.quali.america, at.cup, etc.
197
- # - also allow 0-9 in keys e.g. at.2, at.3.1, etc.
198
-
199
- # fix/todo: add support for leading underscore _
200
- # or allow keys starting w/ digits?
201
-
202
- # NB: key must start w/ a-z letter (NB: minimum one letter possible)
203
- if values[0] =~ /^([a-z][a-z0-9.]*[a-z0-9]|[a-z])$/
204
- key_col = values[0]
205
- title_col = values[1]
206
- more_values = values[2..-1]
207
- else
208
- key_col = '<auto>'
209
- title_col = values[0]
210
- more_values = values[1..-1]
211
- end
212
-
213
- attribs = {}
214
-
215
- ## check title_col for grade (e.g. ***/**/*) and use returned stripped title_col if exits
216
- grade, title_col = find_grade( title_col )
217
-
218
- # NB: for now - do NOT include default grade e.g. if grade (***/**/*) not present; attrib will not be present too
219
- if grade == 1 || grade == 2 || grade == 3 # grade found/present
220
- logger.debug " found grade #{grade} in title"
221
- attribs[:grade] = grade
222
- end
223
-
224
- ## fix/todo: add find parts ??
225
- # e.g. ‹Estrella› ‹Damm› Inedit
226
- # becomes => title: 'Estrella Damm Inedit' and parts: ['Estrella','Damm']
227
-
228
- ## title (split of optional synonyms)
229
- # e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
230
- titles = title_col.split('|')
231
-
232
- attribs[ :title ] = titles[0]
233
-
234
- ## add optional synonyms if present
235
- attribs[ :synonyms ] = titles[1..-1].join('|') if titles.size > 1
236
-
237
- if key_col == '<auto>'
238
- ## autogenerate key from first title
239
- key_col = TextUtils.title_to_key( titles[0] )
240
- logger.debug " autogen key »#{key_col}« from title »#{titles[0]}«"
241
- end
242
-
243
- attribs[ :key ] = key_col
244
-
245
- [attribs, more_values]
246
- end
247
-
248
- end # module ValueHelper
249
- end # module TextUtils