textutils 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,3 +7,9 @@ lib/textutils/filter/code_filter.rb
7
7
  lib/textutils/filter/comment_filter.rb
8
8
  lib/textutils/filter/erb_django_filter.rb
9
9
  lib/textutils/filter/erb_filter.rb
10
+ lib/textutils/reader/code_reader.rb
11
+ lib/textutils/reader/hash_reader.rb
12
+ lib/textutils/reader/line_reader.rb
13
+ lib/textutils/reader/values_reader.rb
14
+ lib/textutils/utils.rb
15
+ lib/textutils/version.rb
data/Rakefile CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'hoe'
2
- require './lib/textutils.rb'
2
+ require './lib/textutils/version.rb'
3
3
 
4
4
  Hoe.spec 'textutils' do
5
5
 
@@ -16,5 +16,16 @@ Hoe.spec 'textutils' do
16
16
  # switch extension to .markdown for GitHub formatting
17
17
  self.readme_file = 'README.markdown'
18
18
  self.history_file = 'History.markdown'
19
+
20
+ self.extra_deps = [
21
+ ['logutils', '~> 0.2.0']
22
+ ]
23
+
24
+ self.licenses = ['Public Domain']
25
+
26
+ self.spec_extras = {
27
+ :required_ruby_version => '>= 1.9.2'
28
+ }
19
29
 
30
+
20
31
  end
@@ -8,17 +8,23 @@ require 'optparse'
8
8
  require 'fileutils'
9
9
  require 'erb'
10
10
 
11
+ # 3rd party gems / libs
12
+
13
+ require 'logutils'
11
14
 
12
15
  # our own code
13
16
 
17
+ require 'textutils/version'
18
+
14
19
  require 'textutils/filter/code_filter'
15
20
  require 'textutils/filter/comment_filter'
16
21
  require 'textutils/filter/erb_django_filter'
17
22
  require 'textutils/filter/erb_filter'
18
23
 
24
+ require 'textutils/utils'
25
+ require 'textutils/reader/code_reader'
26
+ require 'textutils/reader/hash_reader'
27
+ require 'textutils/reader/line_reader'
28
+ require 'textutils/reader/values_reader'
19
29
 
20
- module TextUtils
21
-
22
- VERSION = '0.2.0'
23
30
 
24
- end # module TextUtils
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
+ class CodeReader
4
+
5
+ def initialize( logger=nil, path )
6
+ if logger.nil?
7
+ @logger = Logger.new(STDOUT)
8
+ @logger.level = Logger::INFO
9
+ else
10
+ @logger = logger
11
+ end
12
+
13
+ @path = path
14
+
15
+ ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
16
+ ## - see worlddb/utils.rb
17
+
18
+ @code = File.read_utf8( @path )
19
+ end
20
+
21
+ def eval( klass )
22
+ klass.class_eval( @code )
23
+
24
+ # NB: same as
25
+ #
26
+ # module WorldDB
27
+ # include WorldDB::Models
28
+ # <code here>
29
+ # end
30
+ end
31
+
32
+ attr_reader :logger
33
+
34
+ end # class CodeReader
@@ -0,0 +1,109 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ class HashReader
5
+
6
+ def initialize( logger=nil, path )
7
+ if logger.nil?
8
+ @logger = Logger.new(STDOUT)
9
+ @logger.level = Logger::INFO
10
+ else
11
+ @logger = logger
12
+ end
13
+
14
+ @path = path
15
+
16
+ ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
17
+ ## - see worlddb/utils.rb
18
+
19
+ text = File.read_utf8( @path )
20
+
21
+ ### hack for syck yaml parser (e.g.ruby 1.9.2) (cannot handle !!null)
22
+ ## change it to !null to get plain nil
23
+ ## w/ both syck and psych/libyaml
24
+
25
+ text = text.gsub( '!!null', '!null' )
26
+
27
+ ### hacks for yaml
28
+
29
+ ### see yaml gotchas
30
+ ## - http://www.perlmonks.org/?node_id=738671
31
+ ## -
32
+
33
+ ## replace all tabs w/ two spaces and issue a warning
34
+ ## nb: yaml does NOT support tabs see why here -> yaml.org/faq.html
35
+
36
+ text = text.gsub( "\t" ) do |_|
37
+ puts "*** warn: hash reader - found tab (\t) replacing w/ two spaces; yaml forbids tabs; see yaml.org/faq.html"
38
+ ' ' # replace w/ two spaces
39
+ end
40
+
41
+ ## quote implicit boolean types on,no,n,y
42
+
43
+ ## nb: escape only if key e.g. no: or "free standing" value on its own line e.g.
44
+ ## no: no
45
+
46
+ text = text.gsub( /^([ ]*)(ON|On|on|NO|No|no|N|n|Y|y)[ ]*:/ ) do |value|
47
+ puts "*** warn: hash reader - found implicit bool (#{$1}#{$2}) for key; adding quotes to turn into string; see yaml.org/refcard.html"
48
+ # nb: preserve leading spaces for structure - might be significant
49
+ "#{$1}'#{$2}':" # add quotes to turn it into a string (not bool e.g. true|false)
50
+ end
51
+
52
+ ## nb: value must be freestanding (only allow optional eol comment)
53
+ ## do not escape if part of string sequence e.g.
54
+ ## key: nb,nn,no,se => nb,nn,'no',se -- avoid!!
55
+
56
+ text = text.gsub( /:[ ]+(ON|On|on|NO|No|no|N|n|Y|y)[ ]*($| #.*$)/ ) do |value|
57
+ puts "*** warn: hash reader - found implicit bool (#{$1}) for value; adding quotes to turn into string; see yaml.org/refcard.html"
58
+ ": '#{$1}'" # add quotes to turn it into a string (not bool e.g. true|false)
59
+ end
60
+
61
+
62
+ @hash = YAML.load( text )
63
+ end
64
+
65
+ attr_reader :logger
66
+
67
+
68
+ ###
69
+ # nb: returns all values as strings
70
+ #
71
+
72
+ def each
73
+ @hash.each do |key_wild, value_wild|
74
+ # normalize
75
+ # - key n value as string (not symbols, bool? int? array?)
76
+ # - remove leading and trailing whitespace
77
+ key = key_wild.to_s.strip
78
+ value = value_wild.to_s.strip
79
+
80
+ puts "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value}<<"
81
+
82
+ yield( key, value )
83
+ end
84
+ end # method each
85
+
86
+ ###
87
+ # todo: what name to use: each_object or each_typed ???
88
+ # or use new TypedHashReader class or similar??
89
+
90
+ def each_typed
91
+ @hash.each do |key_wild, value_wild|
92
+ # normalize
93
+ # - key n value as string (not symbols, bool? int? array?)
94
+ # - remove leading and trailing whitespace
95
+ key = key_wild.to_s.strip
96
+
97
+ if value_wild.is_a?( String )
98
+ value = value_wild.strip
99
+ else
100
+ value = value_wild
101
+ end
102
+
103
+ puts "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value}<<"
104
+
105
+ yield( key, value )
106
+ end
107
+ end # method each_typed
108
+
109
+ end # class HashReader
@@ -0,0 +1,90 @@
1
+ # encoding: utf-8
2
+
3
+ ##
4
+ ## fix/todo: move to/merge into LineReader itself
5
+ # e.g. use fromString c'tor ??? or similar??
6
+
7
+ class StringLineReader
8
+
9
+ def initialize( logger=nil, data )
10
+ if logger.nil?
11
+ @logger = Logger.new(STDOUT)
12
+ @logger.level = Logger::INFO
13
+ else
14
+ @logger = logger
15
+ end
16
+
17
+ @data = data
18
+ end
19
+
20
+ attr_reader :logger
21
+
22
+
23
+ def each_line
24
+ @data.each_line do |line|
25
+
26
+ if line =~ /^\s*#/
27
+ # skip comments and do NOT copy to result (keep comments secret!)
28
+ logger.debug 'skipping comment line'
29
+ next
30
+ end
31
+
32
+ if line =~ /^\s*$/
33
+ # skip blank (whitespace-only) line; comment lines are already skipped above
34
+ logger.debug 'skipping blank line'
35
+ next
36
+ end
37
+
38
+ # remove leading and trailing whitespace
39
+ line = line.strip
40
+
41
+ yield( line )
42
+ end # each lines
43
+ end # method each_line
44
+
45
+ end
46
+
47
+
48
+ class LineReader
49
+
50
+ def initialize( logger=nil, path )
51
+ if logger.nil?
52
+ @logger = Logger.new(STDOUT)
53
+ @logger.level = Logger::INFO
54
+ else
55
+ @logger = logger
56
+ end
57
+
58
+ @path = path
59
+
60
+ ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
61
+ ## - see worlddb/utils.rb
62
+ @data = File.read_utf8( @path )
63
+ end
64
+
65
+ attr_reader :logger
66
+
67
+
68
+ def each_line
69
+ @data.each_line do |line|
70
+
71
+ if line =~ /^\s*#/
72
+ # skip comments and do NOT copy to result (keep comments secret!)
73
+ logger.debug 'skipping comment line'
74
+ next
75
+ end
76
+
77
+ if line =~ /^\s*$/
78
+ # skip blank (whitespace-only) line; comment lines are already skipped above
79
+ logger.debug 'skipping blank line'
80
+ next
81
+ end
82
+
83
+ # remove leading and trailing whitespace
84
+ line = line.strip
85
+
86
+ yield( line )
87
+ end # each lines
88
+ end # method each_line
89
+
90
+ end # class LineReader
@@ -0,0 +1,175 @@
1
+ # encoding: utf-8
2
+
3
+ class ValuesReader
4
+
5
+ def initialize( logger, path, more_values={} )
6
+ ## todo: check - can we make logger=nil a default arg too?
7
+ if logger.nil?
8
+ @logger = Logger.new(STDOUT)
9
+ @logger.level = Logger::INFO
10
+ else
11
+ @logger = logger
12
+ end
13
+
14
+ @path = path
15
+
16
+ @more_values = more_values
17
+
18
+ @data = File.read_utf8( @path )
19
+ end
20
+
21
+ attr_reader :logger
22
+
23
+ def each_line
24
+
25
+ @data.each_line do |line|
26
+
27
+ if line =~ /^\s*#/
28
+ # skip comments and do NOT copy to result (keep comments secret!)
29
+ logger.debug 'skipping comment line'
30
+ next
31
+ end
32
+
33
+ if line =~ /^\s*$/
34
+ # skip blank (whitespace-only) line; comment lines are already skipped above
35
+ logger.debug 'skipping blank line'
36
+ next
37
+ end
38
+
39
+
40
+ # pass 1) remove possible trailing eol comment
41
+ ## e.g -> nyc, New York # Sample EOL Comment Here (with or without commas,,,,)
42
+ ## becomes -> nyc, New York
43
+
44
+ line = line.sub( /\s+#.+$/, '' )
45
+
46
+ # pass 2) remove leading and trailing whitespace
47
+
48
+ line = line.strip
49
+
50
+ puts "line: >>#{line}<<"
51
+
52
+ values = line.split(',')
53
+
54
+ # pass 1) remove leading and trailing whitespace for values
55
+
56
+ values = values.map { |value| value.strip }
57
+
58
+ ##### todo remove support of comment column? (NB: must NOT include commas)
59
+ # pass 2) remove comment columns
60
+
61
+ values = values.select do |value|
62
+ if value =~ /^#/ ## start with # treat it as a comment column; e.g. remove it
63
+ puts " removing column with value >>#{value}<<"
64
+ false
65
+ else
66
+ true
67
+ end
68
+ end
69
+
70
+ puts " values: >>#{values.join('<< >>')}<<"
71
+
72
+
73
+ ### todo/fix: allow check - do NOT allow mixed use of with key and w/o key
74
+ ## either use keys or do NOT use keys; do NOT mix in a single fixture file
75
+
76
+
77
+ ### support autogenerate key from first title value
78
+
79
+ # if it looks like a key (only a-z lower case allowed); assume it's a key
80
+ # - also allow . in keys e.g. world.quali.america, at.cup, etc.
81
+
82
+ if values[0] =~ /^[a-z][a-z.]*[a-z]$/ # NB: minimum two a-z letters required
83
+ key_col = values[0]
84
+ title_col = values[1]
85
+ more_cols = values[2..-1]
86
+ else
87
+ key_col = '<auto>'
88
+ title_col = values[0]
89
+ more_cols = values[1..-1]
90
+ end
91
+
92
+ attribs = {}
93
+
94
+ ## title (split off optional synonyms)
95
+ # e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
96
+ titles = title_col.split('|')
97
+
98
+ attribs[ :title ] = titles[0]
99
+
100
+ ## add optional synonyms if present
101
+ attribs[ :synonyms ] = titles[1..-1].join('|') if titles.size > 1
102
+
103
+ if key_col == '<auto>'
104
+ ## autogenerate key from first title
105
+ key_col = title_to_key( titles[0] )
106
+ puts " autogen key >#{key_col}< from title >#{titles[0]}<"
107
+ end
108
+
109
+ attribs[ :key ] = key_col
110
+
111
+ attribs = attribs.merge( @more_values ) # e.g. merge country_id and other defaults if present
112
+
113
+ yield( attribs, more_cols )
114
+
115
+ end # each lines
116
+
117
+ end # method each_line
118
+
119
+
120
+
121
+ def title_to_key( title )
122
+
123
+ ## NB: downcase does NOT work for accented chars (thus, include in alternatives)
124
+ key = title.downcase
125
+
126
+ ### remove optional english translation in square brackets ([]) e.g. Wien [Vienna]
127
+ key = key.gsub( /\[.+\]/, '' )
128
+
129
+ ## remove optional longer title part in () e.g. Las Palmas (de Gran Canaria), Palma (de Mallorca)
130
+ key = key.gsub( /\(.+\)/, '' )
131
+
132
+ ## remove all whitespace and punctuation
133
+ key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )
134
+
135
+ ## turn accented char into ascii look alike if possible
136
+ ##
137
+ ## todo: add some more
138
+ ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
139
+
140
+ alternatives = [
141
+ ['ß', 'ss'],
142
+ ['æ', 'ae'],
143
+ ['ä', 'ae'],
144
+ ['á', 'a' ], # e.g. Bogotá, Králové
145
+ ['ã', 'a' ], # e.g São Paulo
146
+ ['ă', 'a' ], # e.g. Chișinău
147
+ ['é', 'e' ], # e.g. Vélez, Králové
148
+ ['è', 'e' ], # e.g. Rivières
149
+ ['ê', 'e' ], # e.g. Grêmio
150
+ ['ě', 'e' ], # e.g. Budějovice
151
+ ['ì', 'i' ], # e.g. Potosì
152
+ ['í', 'i' ], # e.g. Ústí
153
+ ['ñ', 'n' ], # e.g. Porteño
154
+ ['ň', 'n' ], # e.g. Plzeň, Třeboň
155
+ ['ö', 'oe'],
156
+ ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
157
+ ['ř', 'r' ], # e.g. Třeboň
158
+ ['ș', 's' ], # e.g. Chișinău
159
+ ['ü', 'ue'],
160
+ ['ú', 'u' ], # e.g. Fútbol
161
+ ['ź', 'z' ], # e.g. Łódź
162
+ ['Č', 'c' ], # e.g. České
163
+ ['Ł', 'l' ], # e.g. Łódź
164
+ ['Ú', 'u' ], # e.g. Ústí
165
+ ]
166
+
167
+ alternatives.each do |alt|
168
+ key = key.gsub( alt[0], alt[1] )
169
+ end
170
+
171
+ key
172
+ end # method title_to_key
173
+
174
+
175
+ end # class ValuesReader
@@ -0,0 +1,71 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ class File
5
+ def self.read_utf8( path )
6
+ text = open( path, 'r:bom|utf-8' ) do |file|
7
+ file.read
8
+ end
9
+ ### for convenience for now convert fancy dash to "plain" dash
10
+ ## todo: check char codes for "fancy" dash alternatives
11
+ text.gsub!( /—|–/ ) do |_|
12
+ puts "*** warning: convert fancy dash to 'plain' dash in file >#{path}<"
13
+ ### exit 1 #### do NOT tolerate!! cleanup dashes for now
14
+ '-'
15
+ end
16
+ text
17
+ end
18
+ end # class File
19
+
20
+
21
+ ############
22
+ ### fix/todo: share helper for all text readers/parsers- where to put it?
23
+ ###
24
+
25
+ def title_esc_regex( title_unescaped )
26
+
27
+ ## escape regex special chars e.g. . to \. and ( to \( etc.
28
+ # e.g. Benfica Lis.
29
+ # e.g. Club Atlético Colón (Santa Fe)
30
+
31
+ ## NB: cannot use Regexp.escape! it will escape space ' ' to '\ '
32
+ ## title = Regexp.escape( title_unescaped )
33
+ title = title_unescaped.gsub( '.', '\.' )
34
+ title = title.gsub( '(', '\(' )
35
+ title = title.gsub( ')', '\)' )
36
+
37
+ ## match accented char with or without accents
38
+ ## add (ü|ue) etc.
39
+ ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss
40
+
41
+ ## todo: add some more
42
+ ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
43
+ ##
44
+ ## reuse for all readers!
45
+
46
+ alternatives = [
47
+ ['-', '(-| )'], ## e.g. Blau-Weiß Linz
48
+ ['æ', '(æ|ae)'], ## e.g.
49
+ ['á', '(á|a)'], ## e.g. Bogotá, Sársfield
50
+ ['ã', '(ã|a)'], ## e.g São Paulo
51
+ ['ä', '(ä|ae)'], ## e.g.
52
+ ['ç', '(ç|c)'], ## e.g. Fenerbahçe
53
+ ['é', '(é|e)'], ## e.g. Vélez
54
+ ['ê', '(ê|e)'], ## e.g. Grêmio
55
+ ['ñ', '(ñ|n)'], ## e.g. Porteño
56
+ ['ň', '(ň|n)'], ## e.g. Plzeň
57
+ ['Ö', '(Ö|Oe)'], ## e.g. Österreich
58
+ ['ö', '(ö|oe)'], ## e.g. Mönchengladbach
59
+ ['ó', '(ó|o)'], ## e.g. Colón
60
+ ['ș', '(ș|s)'], ## e.g. Bucarești
61
+ ['ß', '(ß|ss)'], ## e.g. Blau-Weiß Linz
62
+ ['ü', '(ü|ue)'], ## e.g.
63
+ ['ú', '(ú|u)'] ## e.g. Fútbol
64
+ ]
65
+
66
+ alternatives.each do |alt|
67
+ title = title.gsub( alt[0], alt[1] )
68
+ end
69
+
70
+ title
71
+ end
@@ -0,0 +1,7 @@
1
+
2
+ module TextUtils
3
+
4
+ VERSION = '0.3.0'
5
+
6
+ end # module TextUtils
7
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 2
8
+ - 3
9
9
  - 0
10
- version: 0.2.0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Gerald Bauer
@@ -15,12 +15,28 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-06-09 00:00:00 Z
18
+ date: 2013-02-20 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
- name: rdoc
21
+ name: logutils
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ hash: 23
29
+ segments:
30
+ - 0
31
+ - 2
32
+ - 0
33
+ version: 0.2.0
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: rdoc
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
24
40
  none: false
25
41
  requirements:
26
42
  - - ~>
@@ -31,22 +47,22 @@ dependencies:
31
47
  - 10
32
48
  version: "3.10"
33
49
  type: :development
34
- version_requirements: *id001
50
+ version_requirements: *id002
35
51
  - !ruby/object:Gem::Dependency
36
52
  name: hoe
37
53
  prerelease: false
38
- requirement: &id002 !ruby/object:Gem::Requirement
54
+ requirement: &id003 !ruby/object:Gem::Requirement
39
55
  none: false
40
56
  requirements:
41
57
  - - ~>
42
58
  - !ruby/object:Gem::Version
43
- hash: 7
59
+ hash: 1
44
60
  segments:
45
61
  - 3
46
- - 0
47
- version: "3.0"
62
+ - 3
63
+ version: "3.3"
48
64
  type: :development
49
- version_requirements: *id002
65
+ version_requirements: *id003
50
66
  description: textutils - Text Filters and Helpers
51
67
  email: webslideshow@googlegroups.com
52
68
  executables: []
@@ -65,9 +81,15 @@ files:
65
81
  - lib/textutils/filter/comment_filter.rb
66
82
  - lib/textutils/filter/erb_django_filter.rb
67
83
  - lib/textutils/filter/erb_filter.rb
84
+ - lib/textutils/reader/code_reader.rb
85
+ - lib/textutils/reader/hash_reader.rb
86
+ - lib/textutils/reader/line_reader.rb
87
+ - lib/textutils/reader/values_reader.rb
88
+ - lib/textutils/utils.rb
89
+ - lib/textutils/version.rb
68
90
  homepage: http://geraldb.github.com/textutils
69
- licenses: []
70
-
91
+ licenses:
92
+ - Public Domain
71
93
  post_install_message:
72
94
  rdoc_options:
73
95
  - --main
@@ -79,10 +101,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
79
101
  requirements:
80
102
  - - ">="
81
103
  - !ruby/object:Gem::Version
82
- hash: 3
104
+ hash: 55
83
105
  segments:
84
- - 0
85
- version: "0"
106
+ - 1
107
+ - 9
108
+ - 2
109
+ version: 1.9.2
86
110
  required_rubygems_version: !ruby/object:Gem::Requirement
87
111
  none: false
88
112
  requirements: