textutils 1.2.4 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 16e24e7bc0a1004bc3fca49b6f3bfcaa6ce2e5ce
4
- data.tar.gz: 6aa074466c9c89b089ecf0ac6d75164050abd0cb
3
+ metadata.gz: cb4855c8da2e08a23a8c84f436058695ca79b5d2
4
+ data.tar.gz: 7373f05346e5939481ab569bc411d37b8d155a09
5
5
  SHA512:
6
- metadata.gz: f386324301ffc37deba32eb32202edf9b8706ff1b14971ba5a41db22a2d9a314be64914b4296e359cea2620a6b54ad89ae1cfa6e7c9fa1322ec4ef29020fb688
7
- data.tar.gz: d56300976f712b8bb9de8f1e2e571948f35783090d1d7f1a238736f8efd6a60aba915d5ff807cb942a38f47f80b8d6917e6e1e846371b72b62022c8b3ad029f0
6
+ metadata.gz: 547e53097dcc8ca0bade46448b7fd26998cc4ff333b33142bddb0db6b8654e004d78498c01bdf27fb9c2b10578e8e5808f89e7aa632ec73ffa1f114dbf693979
7
+ data.tar.gz: d9d26f569c2ebd76766c6ae8d592e08310f5632c40a78dc80f04d2085b031551a7f61f21cca046bce8fcbd922584a273540346f9d26863b41d6093f956bf828e
data/Manifest.txt CHANGED
@@ -24,12 +24,14 @@ lib/textutils/helper/value_helper_iii_numbers.rb
24
24
  lib/textutils/helper/xml_helper.rb
25
25
  lib/textutils/page.rb
26
26
  lib/textutils/parser/name_parser.rb
27
+ lib/textutils/parser/name_tokenizer.rb
27
28
  lib/textutils/patterns.rb
28
29
  lib/textutils/reader/block_reader.rb
29
30
  lib/textutils/reader/code_reader.rb
30
31
  lib/textutils/reader/fixture_reader.rb
31
32
  lib/textutils/reader/hash_reader.rb
32
33
  lib/textutils/reader/line_reader.rb
34
+ lib/textutils/reader/tree_reader.rb
33
35
  lib/textutils/reader/values_reader.rb
34
36
  lib/textutils/sanitizier.rb
35
37
  lib/textutils/title.rb
@@ -37,6 +39,9 @@ lib/textutils/title_mapper.rb
37
39
  lib/textutils/utils.rb
38
40
  lib/textutils/version.rb
39
41
  test/data/cl_all.txt
42
+ test/data/de-deutschland/3--by-bayern/4--oberfranken/orte.txt
43
+ test/data/de-deutschland/3--by-bayern/4--oberfranken/orte_ii.txt
44
+ test/data/de-deutschland/orte.txt
40
45
  test/data/feedburner.txt
41
46
  test/helper.rb
42
47
  test/test_address_helper.rb
@@ -49,5 +54,6 @@ test/test_taglist.rb
49
54
  test/test_title_finder.rb
50
55
  test/test_title_helper.rb
51
56
  test/test_title_mapper.rb
57
+ test/test_tree_reader.rb
52
58
  test/test_unicode_helper.rb
53
59
  test/test_values_reader.rb
data/lib/textutils.rb CHANGED
@@ -52,8 +52,8 @@ require 'textutils/core_ext/file'
52
52
  require 'textutils/core_ext/time'
53
53
  require 'textutils/core_ext/array'
54
54
 
55
-
56
55
  require 'textutils/parser/name_parser'
56
+ require 'textutils/parser/name_tokenizer'
57
57
 
58
58
  require 'textutils/reader/code_reader'
59
59
  require 'textutils/reader/hash_reader'
@@ -61,6 +61,7 @@ require 'textutils/reader/line_reader'
61
61
  require 'textutils/reader/values_reader'
62
62
  require 'textutils/reader/fixture_reader'
63
63
  require 'textutils/reader/block_reader'
64
+ require 'textutils/reader/tree_reader'
64
65
 
65
66
  require 'textutils/classifier'
66
67
  require 'textutils/title' # title table/mapper/finder utils
@@ -71,4 +72,4 @@ require 'textutils/page' # for book pages and page templates
71
72
 
72
73
 
73
74
  # say hello
74
- puts TextUtils.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
75
+ puts TextUtils.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
@@ -2,60 +2,65 @@
2
2
 
3
3
  # fix: move into TextUtils namespace/module!! ??
4
4
 
5
+ class NameParser
5
6
 
6
- class NameTokenizer ## - rename to NameScanner, NameSplitter, NameSeparator, etc.
7
-
8
- ## split (single) string value into array of names
9
- ## e.g.
10
- ## 'München [Munich]' => ['München', '[Munich]']
11
- ## 'Wr. Neustadt | Wiener Neustadt' => ['Wr. Neustadt', 'Wiener Neustadt']
12
7
  include LogUtils::Logging
13
8
 
14
- def tokenize( value ) ## rename to/use split - why? why not??
15
- names = []
9
+ def parse( chunks )
10
+ ## todo/fix: (re)use nameparser - for now "simple" inline version
11
+ ## fix!!! - note: for now lang gets ignored
12
+ ## fix: add handling for
13
+ ## Leuven[nl]|Louvain[fr] Löwen[de]
14
+ ## Antwerpen[nl]|Anvers[fr] [Antwerp]
15
+ ## Brussel[nl]•Bruxelles[fr] -> official bi-lingual name
16
+ ## etc.
16
17
 
17
- # 1) split by | (pipe) -- remove leading n trailing whitespaces
18
- parts = value.split( /[ \t]*\|[ \t]*/ )
18
+ ## values - split into names (name n lang pairs)
19
+ ## note: assumes (default) lang from more_attribs unless otherwise marked e.g. [] assume en etc.
19
20
 
20
- # 2) split "inline" translations e.g. München [Munich]
21
+ ## split chunks into values
22
+ values = []
23
+ chunks.each do |chunk|
24
+ next if chunk.nil? || chunk.blank? ## skip nil or empty/blank chunks
21
25
 
22
- ## todo: add support for Munich [en] e.g. trailing lang tag
23
- ## todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??
26
+ parts = chunk.split( '|' ) # 1) split |
24
27
 
25
- parts.each do |part|
28
+ parts.each do |part|
26
29
  s = StringScanner.new( part )
27
30
  s.skip( /[ \t]+/) # skip whitespaces
28
31
 
29
32
  while s.eos? == false
30
33
  if s.check( /\[/ )
31
34
  ## scan everything until the end of bracket (e.g.])
32
- name = s.scan( /\[[^\]]+\]/)
33
- ## todo/fix: if name nil - issue warning??
34
- # starting w/ [ but no closing ] found !!!! - possible? fix!!
35
+ ## fix!!! - note: for now lang gets ignored
36
+ value = s.scan( /\[[^\]]+\]/)
37
+ value = value[1...-1] # strip enclosing [] e.g. [Bavaria] => Bavaria
35
38
  else
36
39
  ## scan everything until the begin of bracket (e.g.[)
37
- name = s.scan( /[^\[]+/)
38
- name = name.rstrip ## remove trailing spaces (if present)
40
+ value = s.scan( /[^\[]+/)
41
+ value = value.strip
39
42
  end
40
- names << name
43
+ values << value
41
44
 
42
45
  s.skip( /[ \t]+/) # skip whitespaces
43
- logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
46
+ logger.debug( "[NameParser] eos?: #{s.eos?}, rest: >#{s.rest}<" )
44
47
  end
45
- end # each part
46
-
47
- logger.debug( "[NameTokenizer] names=#{names.inspect}")
48
- names
49
- end # method split
50
- end # class NameTokenizer
48
+ end
49
+ end
51
50
 
51
+ logger.debug( "[NameParser] values=#{values.inspect}")
52
52
 
53
- =begin
54
- class NameParser
53
+ names = []
54
+ values.each do |value|
55
+ name = value
56
+ ## todo: split by bullet ? (official multilang name) e.g. Brussel • Bruxelles
57
+ ## todo: process variants w/ () e.g. Krems (a. d. Donau) etc. ??
58
+ names << name
59
+ end
55
60
 
56
- include LogUtils::Logging
61
+ logger.debug( "[NameParser] names=#{names.inspect}")
57
62
 
58
- ## to be done
59
-
63
+ names
64
+ end # method parse
60
65
  end # class NameParser
61
- =end
66
+
@@ -0,0 +1,51 @@
1
+ # encoding: utf-8
2
+
3
+ # fix: move into TextUtils namespace/module!! ??
4
+
5
+
6
+ class NameTokenizer ## - rename to NameScanner, NameSplitter, NameSeparator, etc.
7
+
8
+ ## split (single) string value into array of names
9
+ ## e.g.
10
+ ## 'München [Munich]' => ['München', '[Munich]']
11
+ ## 'Wr. Neustadt | Wiener Neustadt' => ['Wr. Neustadt', 'Wiener Neustadt']
12
+ include LogUtils::Logging
13
+
14
+ def tokenize( value ) ## rename to/use split - why? why not??
15
+ names = []
16
+
17
+ # 1) split by | (pipe) -- remove leading n trailing whitespaces
18
+ parts = value.split( /[ \t]*\|[ \t]*/ )
19
+
20
+ # 2) split "inline" translations e.g. München [Munich]
21
+
22
+ ## todo: add support for Munich [en] e.g. trailing lang tag
23
+ ## todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??
24
+
25
+ parts.each do |part|
26
+ s = StringScanner.new( part )
27
+ s.skip( /[ \t]+/) # skip whitespaces
28
+
29
+ while s.eos? == false
30
+ if s.check( /\[/ )
31
+ ## scan everything until the end of bracket (e.g.])
32
+ name = s.scan( /\[[^\]]+\]/)
33
+ ## todo/fix: if name nil - issue warning??
34
+ # starting w/ [ but no closing ] found !!!! - possible? fix!!
35
+ else
36
+ ## scan everything until the begin of bracket (e.g.[)
37
+ name = s.scan( /[^\[]+/)
38
+ name = name.rstrip ## remove trailing spaces (if present)
39
+ end
40
+ names << name
41
+
42
+ s.skip( /[ \t]+/) # skip whitespaces
43
+ logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
44
+ end
45
+ end # each part
46
+
47
+ logger.debug( "[NameTokenizer] names=#{names.inspect}")
48
+ names
49
+ end # method tokenize
50
+ end # class NameTokenizer
51
+
@@ -0,0 +1,96 @@
1
+ # encoding: utf-8
2
+
3
+ # fix: move into TextUtils namespace/module!!
4
+
5
+ class TreeReader
6
+
7
+ include LogUtils::Logging
8
+
9
+ def self.from_file( path )
10
+ ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
11
+ ## - see textutils/utils.rb
12
+ text = File.read_utf8( path )
13
+ self.from_string( text )
14
+ end
15
+
16
+ def self.from_string( text )
17
+ self.new( text )
18
+ end
19
+
20
+ def initialize( text )
21
+ @text = text
22
+ end
23
+
24
+ TreeItem = Struct.new( :level, :key, :value )
25
+
26
+ KEY_REGEX = /
27
+ ([0-9][0-9A-Za-z]*) ## key starting with a number
28
+ |
29
+ ([a-z]+) ## key all lowercase e.g. bt,n,etc.
30
+ |
31
+ ([A-Z]+) ## key all uppercase e.g. BT,N,etc
32
+ /x
33
+
34
+ LEVEL_REGEX = /\.+/ ## e.g. .. or .... etc.
35
+
36
+
37
+ def each_line
38
+ stack = [] # note: last_level => stack.size; starts w/ 0
39
+ times = 2 # assume an indent factor of two dots per level (e.g. .. => level 2, .... => level 3 etc.) for now
40
+
41
+ reader = LineReader.from_string( @text )
42
+ reader.each_line do |line|
43
+
44
+ logger.debug "[TreeReader] line (before) => >#{line}<"
45
+
46
+ s = StringScanner.new( line )
47
+ s.skip( /[ \t]+/ ) # remove whitespace
48
+
49
+ key = s.scan( KEY_REGEX )
50
+ if key
51
+ s.skip( /[ \t]+/ ) # remove whitespace
52
+ end
53
+
54
+ level_str = s.scan( LEVEL_REGEX )
55
+ if level_str
56
+ ## FIX!! todo/check: make sure level_str.size is a multiple of two !! (e.g. 2,4,6,etc.)
57
+ level = (level_str.size/times)+1
58
+ s.skip( /[ \t]+/ ) # remove whitespace
59
+ else
60
+ level = 1 ## no level found; assume top level (start w/ 1)
61
+ end
62
+
63
+ ## assume rest is record
64
+ rest = s.rest ## was: s.scan( /.+/ )
65
+
66
+ level_diff = level - stack.size
67
+
68
+ if level_diff > 0
69
+ logger.debug "[TreeReader] up +#{level_diff}"
70
+ ## FIX!!! todo/check/verify/assert: always must be +1
71
+ elsif level_diff < 0
72
+ logger.debug "[TreeReader] down #{level_diff}"
73
+ level_diff.abs.times { stack.pop }
74
+ stack.pop
75
+ else
76
+ ## same level
77
+ stack.pop
78
+ end
79
+
80
+ item = TreeItem.new
81
+ item.level = level
82
+ item.key = key
83
+ item.value = rest
84
+
85
+ stack.push( item )
86
+
87
+ ## for debugging - show tree item (note) hierarchy
88
+ names = stack.map { |it| "(#{it.level}) #{it.value}" }
89
+ logger.debug "[TreeReader] #{names.join( ' › ' )} -- key: >#{key}<, level: >#{level}<, rest: >#{rest}<"
90
+
91
+ yield( stack )
92
+ end
93
+ end # method each_line
94
+
95
+ end # class TreeReader
96
+
@@ -3,8 +3,8 @@
3
3
  module TextUtils
4
4
 
5
5
  MAJOR = 1 ## todo: namespace inside version or something - why? why not??
6
- MINOR = 2
7
- PATCH = 4
6
+ MINOR = 3
7
+ PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -0,0 +1,103 @@
1
+ 2 Bayern
2
+ 24 .. Oberfranken
3
+ 241 .... Bamberg (Stadt) ## Kreisfreie Stadt
4
+ ...... Bamberg
5
+ ........ Bamberg
6
+ 242 .... Bayreuth (Stadt) ## Kreisfreie Stadt
7
+ ...... Bayreuth
8
+ ........ Bayreuth
9
+ 243 .... Coburg (Stadt) ## Kreisfreie Stadt
10
+ ...... Coburg
11
+ ........ Coburg
12
+ 244 .... Hof (Stadt) ## Kreisfreie Stadt
13
+ ...... Hof
14
+ ........ Hof
15
+
16
+ 245 .... Bamberg (Land) ## Landkreis -- 36 Gemeinden; see de.wikipedia.org/wiki/Landkreis_Bamberg
17
+ ## 4 Städte
18
+ ...... Baunach ## (4013, 30,9 km²)
19
+ ........ Baunach
20
+ ...... Hallstadt ## (8364, 14,5 km²)
21
+ ........ Hallstadt ## (7588)
22
+ ........ Dörfleins ## (1380)
23
+ ...... Scheßlitz ## (7184, 94,9 km²)
24
+ ........ Scheßlitz
25
+ ........ Köttensdorf
26
+ ........ Würgau
27
+ ...... Schlüsselfeld ## (5712, 70,2 km²)
28
+
29
+ ## 8 Märkte
30
+ ...... Burgebrach ## (6553, 87,9 km²)
31
+ ...... Burgwindheim ## (1311, 37,4 km²)
32
+ ...... Buttenheim ## (3472, 30 km²)
33
+ ...... Ebrach ## (1830, 29,6 km²)
34
+ ...... Heiligenstadt i. OFr. ## (3525, 76,7 km²)
35
+ ........ Heiligenstadt i. OFr.
36
+ ........ Oberleinleiter
37
+ ...... Hirschaid ## (11.919, 41 km²)
38
+ ...... Rattelsdorf ## (4568, 39,6 km²)
39
+ ........ Rattelsdorf
40
+ ........ Mürsbach
41
+ ........ Freudeneck
42
+ ........ Höfen
43
+ ........ Ebing
44
+ ...... Zapfendorf ## (4954, 30,6 km²)
45
+
46
+ ## 24 Gemeinden
47
+ ...... Altendorf ## (2012, 8,6 km²)
48
+ ...... Bischberg ## (6012, 17,5 km²)
49
+ ...... Breitengüßbach ## (4586, 16,9 km²)
50
+ ........ Breitengüßbach
51
+ ...... Frensdorf ## (4865, 44 km²)
52
+ ...... Gerach ## (946, 7,8 km²)
53
+ ...... Gundelsheim ## (3378, 3,8 km²)
54
+ ...... Kemmern ## (2544, 8,3 km²)
55
+ ........ Kemmern
56
+ ...... Königsfeld ## (1335, 42,7 km²)
57
+ ........ Königsfeld
58
+ ........ Huppendorf
59
+ ...... Lauter ## (1139, 12,8 km²)
60
+ ........ Lauter ## (601)
61
+ ........ Appendorf ## (213)
62
+ ...... Lisberg ## (1813, 8,4 km²)
63
+ ...... Litzendorf ## (6057, 25,9 km²)
64
+ ........ Litzendorf
65
+ ........ Schammelsdorf
66
+ ........ Melkendorf
67
+ ........ Lohndorf
68
+ ........ Tiefenellern
69
+ ...... Memmelsdorf ## (8854, 26,2 km²)
70
+ ........ Memmelsdorf
71
+ ........ Merkendorf
72
+ ........ Drosendorf
73
+ ...... Oberhaid ## (4590, 27,2 km²)
74
+ ........ Oberhaid
75
+ ........ Staffelbach
76
+ ...... Pettstadt ## (1940, 9,9 km²)
77
+ ...... Pommersfelden ## (2851, 35,7 km²)
78
+ ...... Priesendorf ## (1470, 8,4 km²)
79
+ ...... Reckendorf ## (2033, 13,1 km²)
80
+ ........ Reckendorf
81
+ ...... Schönbrunn im Steigerwald ## (1880, 24,7 km²)
82
+ ...... Stadelhofen ## (1250, 41 km²)
83
+ ........ Stadelhofen
84
+ ........ Steinfeld
85
+ ........ Schederndorf
86
+ ...... Stegaurach ## (6842, 23,9 km²)
87
+ ...... Strullendorf ## (7807, 31,7 km²)
88
+ ........ Strullendorf
89
+ ........ Geisfeld
90
+ ........ Roßdorf am Forst
91
+ ...... Viereth-Trunstadt ## (3562, 15,8 km²)
92
+ ...... Walsdorf ## (2575, 16,2 km²)
93
+ ...... Wattendorf ## (679, 22,2 km²)
94
+ ........ Wattendorf
95
+
96
+ 246 .... Bayreuth (Land) ## Landkreis
97
+ 247 .... Coburg (Land) ## Landkreis
98
+ 248 .... Forchheim
99
+ 249 .... Hof (Land) ## Landkreis
100
+ 24A .... Kronach
101
+ 24B .... Kulmbach
102
+ 24C .... Lichtenfels
103
+ 24D .... Wunsiedel i. Fichtelgebirge
@@ -0,0 +1,17 @@
1
+ 2 Bayern
2
+ 24 .. Oberfranken
3
+ 241 .... Bamberg (Stadt) ## Kreisfreie Stadt
4
+ ...... Bamberg
5
+ ........ Bamberg
6
+ 242 .... Bayreuth (Stadt) ## Kreisfreie Stadt
7
+ ...... Bayreuth
8
+ ........ Bayreuth
9
+
10
+ 245 .... Bamberg (Land) ## Landkreis -- 36 Gemeinden; see de.wikipedia.org/wiki/Landkreis_Bamberg
11
+ ## 4 Städte
12
+ ...... Baunach ## (4013, 30,9 km²)
13
+ ........ Baunach
14
+ ...... Hallstadt ## (8364, 14,5 km²)
15
+ ........ Hallstadt ## (7588)
16
+ ........ Dörfleins ## (1380)
17
+
@@ -0,0 +1,12 @@
1
+ 2 Bayern
2
+ 24 .. Oberfranken
3
+ 241 .... Bamberg (Stadt) ## Kreisfreie Stadt
4
+ ...... Bamberg
5
+ ........ Bamberg
6
+
7
+ #####
8
+ # todo: for testing add berlin and some more
9
+
10
+ 9 Berlin
11
+ 91 .. Berlin
12
+
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_tree_reader.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ class TestTreeReader < MiniTest::Test
11
+
12
+ def test_oberfranken
13
+ reader = TreeReader.from_file( "#{TextUtils.root}/test/data/de-deutschland/3--by-bayern/4--oberfranken/orte.txt" )
14
+
15
+ reader.each_line do |_|
16
+ ## do nothing for now
17
+ end
18
+
19
+ assert true ## assume everything ok if we get here
20
+ end
21
+
22
+ def test_de
23
+ reader = TreeReader.from_file( "#{TextUtils.root}/test/data/de-deutschland/orte.txt" )
24
+
25
+ reader.each_line do |_|
26
+ ## do nothing for now
27
+ end
28
+
29
+ assert true ## assume everything ok if we get here
30
+ end
31
+
32
+ end # class TestTreeReader
33
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.4
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
@@ -130,12 +130,14 @@ files:
130
130
  - lib/textutils/helper/xml_helper.rb
131
131
  - lib/textutils/page.rb
132
132
  - lib/textutils/parser/name_parser.rb
133
+ - lib/textutils/parser/name_tokenizer.rb
133
134
  - lib/textutils/patterns.rb
134
135
  - lib/textutils/reader/block_reader.rb
135
136
  - lib/textutils/reader/code_reader.rb
136
137
  - lib/textutils/reader/fixture_reader.rb
137
138
  - lib/textutils/reader/hash_reader.rb
138
139
  - lib/textutils/reader/line_reader.rb
140
+ - lib/textutils/reader/tree_reader.rb
139
141
  - lib/textutils/reader/values_reader.rb
140
142
  - lib/textutils/sanitizier.rb
141
143
  - lib/textutils/title.rb
@@ -143,6 +145,9 @@ files:
143
145
  - lib/textutils/utils.rb
144
146
  - lib/textutils/version.rb
145
147
  - test/data/cl_all.txt
148
+ - test/data/de-deutschland/3--by-bayern/4--oberfranken/orte.txt
149
+ - test/data/de-deutschland/3--by-bayern/4--oberfranken/orte_ii.txt
150
+ - test/data/de-deutschland/orte.txt
146
151
  - test/data/feedburner.txt
147
152
  - test/helper.rb
148
153
  - test/test_address_helper.rb
@@ -155,6 +160,7 @@ files:
155
160
  - test/test_title_finder.rb
156
161
  - test/test_title_helper.rb
157
162
  - test/test_title_mapper.rb
163
+ - test/test_tree_reader.rb
158
164
  - test/test_unicode_helper.rb
159
165
  - test/test_values_reader.rb
160
166
  homepage: https://github.com/textkit/textutils
@@ -188,6 +194,7 @@ test_files:
188
194
  - test/test_fixture_reader.rb
189
195
  - test/test_unicode_helper.rb
190
196
  - test/test_asciify.rb
197
+ - test/test_tree_reader.rb
191
198
  - test/test_title_mapper.rb
192
199
  - test/test_values_reader.rb
193
200
  - test/test_taglist.rb