textutils 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,3 +7,9 @@ lib/textutils/filter/code_filter.rb
7
7
  lib/textutils/filter/comment_filter.rb
8
8
  lib/textutils/filter/erb_django_filter.rb
9
9
  lib/textutils/filter/erb_filter.rb
10
+ lib/textutils/reader/code_reader.rb
11
+ lib/textutils/reader/hash_reader.rb
12
+ lib/textutils/reader/line_reader.rb
13
+ lib/textutils/reader/values_reader.rb
14
+ lib/textutils/utils.rb
15
+ lib/textutils/version.rb
data/Rakefile CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'hoe'
2
- require './lib/textutils.rb'
2
+ require './lib/textutils/version.rb'
3
3
 
4
4
  Hoe.spec 'textutils' do
5
5
 
@@ -16,5 +16,16 @@ Hoe.spec 'textutils' do
16
16
  # switch extension to .markdown for github formatting
17
17
  self.readme_file = 'README.markdown'
18
18
  self.history_file = 'History.markdown'
19
+
20
+ self.extra_deps = [
21
+ ['logutils', '~> 0.2.0']
22
+ ]
23
+
24
+ self.licenses = ['Public Domain']
25
+
26
+ self.spec_extras = {
27
+ :required_ruby_version => '>= 1.9.2'
28
+ }
19
29
 
30
+
20
31
  end
@@ -8,17 +8,23 @@ require 'optparse'
8
8
  require 'fileutils'
9
9
  require 'erb'
10
10
 
11
+ # 3rd party gems / libs
12
+
13
+ require 'logutils'
11
14
 
12
15
  # our own code
13
16
 
17
+ require 'textutils/version'
18
+
14
19
  require 'textutils/filter/code_filter'
15
20
  require 'textutils/filter/comment_filter'
16
21
  require 'textutils/filter/erb_django_filter'
17
22
  require 'textutils/filter/erb_filter'
18
23
 
24
+ require 'textutils/utils'
25
+ require 'textutils/reader/code_reader'
26
+ require 'textutils/reader/hash_reader'
27
+ require 'textutils/reader/line_reader'
28
+ require 'textutils/reader/values_reader'
19
29
 
20
- module TextUtils
21
-
22
- VERSION = '0.2.0'
23
30
 
24
- end # module TextUtils
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
# Reads a ruby code file (utf-8, with or without BOM - byte order mark)
# and evaluates its content in the context of a class or module.
#
# NOTE(review): the file content is executed as-is via class_eval;
# only use with trusted files.
class CodeReader

  attr_reader :logger

  # logger - logger to use; if nil, a Logger writing to STDOUT
  #          with level INFO gets created
  # path   - path of the code file; gets read on construction
  def initialize( logger=nil, path )
    if logger.nil?
      @logger = Logger.new( STDOUT )
      @logger.level = Logger::INFO
    else
      @logger = logger
    end

    @path = path

    ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
    @code = File.read_utf8( @path )
  end

  # Evaluates the code read from the file in the context of klass
  # (a class or module) and returns the value of the last expression.
  #
  # NB: same as
  #
  #   module WorldDB
  #     include WorldDB::Models
  #     <code here>
  #   end
  #
  # The file path is passed along to class_eval so that __FILE__ and
  # backtraces of the evaluated code point to the code file
  # (and not to an anonymous "(eval)").
  def eval( klass )
    klass.class_eval( @code, @path.to_s )
  end

end # class CodeReader
@@ -0,0 +1,109 @@
1
+ # encoding: utf-8
2
+
3
+
4
# Reads a yaml file (utf-8, with or without BOM - byte order mark)
# and yields its key/value pairs.
#
# Before the text gets passed on to the yaml parser it gets patched
# to work around some yaml gotchas (see http://www.perlmonks.org/?node_id=738671):
#   - !!null gets changed to !null
#   - tabs get replaced w/ two spaces (yaml forbids tabs)
#   - implicit booleans (on, no, n, y, etc.) used as keys or
#     free standing values get quoted to turn them into strings
#
# All warnings and debug messages get written to the logger.
class HashReader

  attr_reader :logger

  # logger - logger to use; if nil, a Logger writing to STDOUT
  #          with level INFO gets created
  # path   - path of the yaml file; gets read and parsed on construction
  def initialize( logger=nil, path )
    if logger.nil?
      @logger = Logger.new( STDOUT )
      @logger.level = Logger::INFO
    else
      @logger = logger
    end

    @path = path

    ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
    text = File.read_utf8( @path )

    ## nb: an empty (or comments only) file results in a falsy value (nil|false);
    ##   fall back to an empty hash to let each/each_typed work (yielding nothing)
    ##
    ## NOTE(review): YAML.load on external data; with older yaml parsers this
    ##   might instantiate arbitrary objects - only use with trusted files
    @hash = YAML.load( preprocess( text ) ) || {}
  end

  ###
  # Yields every key/value pair - nb: returns all values as strings
  #   (key and value get converted using to_s and stripped).
  # Returns an enumerator if no block is given.
  def each
    return enum_for( :each ) unless block_given?

    @hash.each do |key_wild, value_wild|
      # normalize
      # - key n value as string (not symbols, bool? int? array?)
      # - remove leading and trailing whitespace
      key   = key_wild.to_s.strip
      value = value_wild.to_s.strip

      logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value}<<"

      yield( key, value )
    end
  end # method each

  ###
  # Yields every key/value pair - the key as a (stripped) string;
  #   the value as is (only string values get stripped).
  # Returns an enumerator if no block is given.
  #
  # todo: what name to use: each_object or each_typed ???
  #   or use new TypedHashReader class or similar??
  def each_typed
    return enum_for( :each_typed ) unless block_given?

    @hash.each do |key_wild, value_wild|
      # normalize
      # - key as string (not symbols, bool? int? array?)
      # - remove leading and trailing whitespace
      key   = key_wild.to_s.strip
      value = value_wild.is_a?( String ) ? value_wild.strip : value_wild

      logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value}<<"

      yield( key, value )
    end
  end # method each_typed

private

  # Patches the raw yaml text (see class comment); returns the patched text.
  def preprocess( text )
    ### hack for syck yaml parser (e.g. ruby 1.9.2) (cannot handle !!null)
    ##   change it to !null to get plain nil
    ##   w/ both syck and psych/libyaml
    text = text.gsub( '!!null', '!null' )

    ## replace all tabs w/ two spaces and issue a warning
    ##  nb: yaml does NOT support tabs see why here -> yaml.org/faq.html
    tabs = text.count( "\t" )
    if tabs > 0
      logger.warn "hash reader - found #{tabs} tab(s); replacing w/ two spaces; yaml forbids tabs; see yaml.org/faq.html"
      text = text.gsub( "\t", '  ' )
    end

    ## quote implicit boolean types on,no,n,y

    ## nb: escape only if key e.g. no: or "free standing" value on its own line e.g.
    ##   no: no
    text = text.gsub( /^([ ]*)(ON|On|on|NO|No|no|N|n|Y|y)[ ]*:/ ) do |_|
      indent, word = $1, $2
      logger.warn "hash reader - found implicit bool (#{indent}#{word}) for key; adding quotes to turn into string; see yaml.org/refcard.html"
      # nb: preserve leading spaces for structure - might be significant
      "#{indent}'#{word}':"   # add quotes to turn it into a string (not bool e.g. true|false)
    end

    ## nb: value must be freestanding (only allow optional eol comment)
    ##  do not escape if part of string sequence e.g.
    ##    key: nb,nn,no,se   =>  nb,nn,'no',se  -- avoid!!
    ##  nb: an optional eol comment gets dropped by the replacement
    text = text.gsub( /:[ ]+(ON|On|on|NO|No|no|N|n|Y|y)[ ]*($| #.*$)/ ) do |_|
      word = $1
      logger.warn "hash reader - found implicit bool (#{word}) for value; adding quotes to turn into string; see yaml.org/refcard.html"
      ": '#{word}'"   # add quotes to turn it into a string (not bool e.g. true|false)
    end

    text
  end

end # class HashReader
@@ -0,0 +1,90 @@
1
+ # encoding: utf-8
2
+
3
+ ##
4
+ ## fix/todo: move to/merge into LineReader itself
5
+ # e.g. use fromString c'tor ??? or similar??
6
+
7
# Reads lines from a string - skipping comment lines (starting with #)
# and blank lines; leading and trailing whitespace gets removed.
class StringLineReader

  attr_reader :logger

  # logger - logger to use; if nil, a Logger writing to STDOUT
  #          with level INFO gets created
  # data   - text (string) to read the lines from
  def initialize( logger=nil, data )
    if logger.nil?
      @logger = Logger.new( STDOUT )
      @logger.level = Logger::INFO
    else
      @logger = logger
    end

    @data = data
  end

  # Yields every line that is not a comment or blank line
  # (with leading and trailing whitespace removed).
  # Returns an enumerator if no block is given.
  def each_line
    return enum_for( :each_line ) unless block_given?

    @data.each_line do |line|

      if line =~ /^\s*#/
        # skip comments and do NOT copy to result (keep comments secret!)
        logger.debug 'skipping comment line'
        next
      end

      if line =~ /^\s*$/
        # skip blank lines
        logger.debug 'skipping blank line'
        next
      end

      # remove leading and trailing whitespace
      yield( line.strip )
    end # each lines
  end # method each_line

end # class StringLineReader


# Same as StringLineReader but reads the text from a file
# (utf-8, with or without BOM - byte order mark); each_line gets inherited.
class LineReader < StringLineReader

  # logger - logger to use; if nil, a Logger writing to STDOUT
  #          with level INFO gets created
  # path   - path of the text file; gets read on construction
  def initialize( logger=nil, path )
    @path = path

    ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
    super( logger, File.read_utf8( @path ) )
  end

end # class LineReader
@@ -0,0 +1,175 @@
1
+ # encoding: utf-8
2
+
3
# Reads a text file (utf-8) with comma-separated values - one record per line.
#
# Line format:
#   [key,] title[|synonym|synonym...], more values...   # optional eol comment
#
# If the first value looks like a key (lower case a-z and dots only; at least
# two letters) and is followed by at least one more value, it gets used as
# the key; otherwise the key gets autogenerated from the (first) title.
class ValuesReader

  ## turn accented char into ascii look alike if possible
  ##
  ## nb: String#downcase only handles a-z in older rubies (thus, upper case
  ##   accented chars are included); newer rubies also downcase accented chars
  ##   (thus, the lower case counterparts are included too)
  ##
  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  TITLE_KEY_ALTERNATIVES = [
    ['ß', 'ss'],
    ['æ', 'ae'],
    ['ä', 'ae'],
    ['á', 'a' ],  # e.g. Bogotá, Králové
    ['ã', 'a' ],  # e.g São Paulo
    ['ă', 'a' ],  # e.g. Chișinău
    ['é', 'e' ],  # e.g. Vélez, Králové
    ['è', 'e' ],  # e.g. Rivières
    ['ê', 'e' ],  # e.g. Grêmio
    ['ě', 'e' ],  # e.g. Budějovice
    ['ì', 'i' ],  # e.g. Potosì
    ['í', 'i' ],  # e.g. Ústí
    ['ñ', 'n' ],  # e.g. Porteño
    ['ň', 'n' ],  # e.g. Plzeň, Třeboň
    ['ö', 'oe'],
    ['ó', 'o' ],  # e.g. Colón, Łódź, Kraków
    ['ř', 'r' ],  # e.g. Třeboň
    ['ș', 's' ],  # e.g. Chișinău
    ['ü', 'ue'],
    ['ú', 'u' ],  # e.g. Fútbol
    ['ź', 'z' ],  # e.g. Łódź
    ['Č', 'c' ],  # e.g. České
    ['č', 'c' ],  # e.g. České (downcased)
    ['Ł', 'l' ],  # e.g. Łódź
    ['ł', 'l' ],  # e.g. Łódź (downcased)
    ['Ú', 'u' ],  # e.g. Ústí
  ].freeze

  attr_reader :logger

  # logger      - logger to use; if nil, a Logger writing to STDOUT
  #               with level INFO gets created
  # path        - path of the values file; gets read on construction
  # more_values - hash of extra attributes merged into the attributes
  #               of every record (e.g. country_id and other defaults)
  def initialize( logger, path, more_values={} )
    ## todo: check - can we make logger=nil a default arg too?
    if logger.nil?
      @logger = Logger.new( STDOUT )
      @logger.level = Logger::INFO
    else
      @logger = logger
    end

    @path = path

    @more_values = more_values

    @data = File.read_utf8( @path )
  end

  # Yields attribs (hash with :title, :key, optional :synonyms plus more_values)
  # and more_cols (array of the remaining values) for every record line.
  # Comment lines, blank lines and lines without any value or title get skipped.
  # Returns an enumerator if no block is given.
  def each_line
    return enum_for( :each_line ) unless block_given?

    @data.each_line do |line|

      if line =~ /^\s*#/
        # skip comments and do NOT copy to result (keep comments secret!)
        logger.debug 'skipping comment line'
        next
      end

      if line =~ /^\s*$/
        # skip blank lines
        logger.debug 'skipping blank line'
        next
      end

      # pass 1) remove possible trailing eol comment
      ##  e.g    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
      ## becomes -> nyc, New York
      # pass 2) remove leading and trailing whitespace
      line = line.sub( /\s+#.+$/, '' ).strip

      logger.debug "line: >>#{line}<<"

      # split into values and remove leading and trailing whitespace for values
      values = line.split( ',' ).map { |value| value.strip }

      ##### todo remove support of comment column? (NB: must NOT include commas)
      # remove comment columns (start with # treat it as a comment column)
      values = values.reject do |value|
        comment = (value =~ /^#/) ? true : false
        logger.debug "   removing column with value >>#{value}<<" if comment
        comment
      end

      logger.debug "  values: >>#{values.join('<< >>')}<<"

      if values.empty?
        # e.g. line with commas only; nothing to yield
        logger.warn "skipping line without values >>#{line}<<"
        next
      end

      ### todo/fix: allow check - do NOT allow mixed use of with key and w/o key
      ##  either use keys or do NOT use keys; do NOT mix in a single fixture file

      ### support autogenerate key from first title value

      # if it looks like a key (only a-z lower case allowed); assume it's a key
      #   - also allow . in keys e.g. world.quali.america, at.cup, etc.
      #   - nb: a lone value is always the title (a record requires a title)
      if values.size > 1 && values[0] =~ /^[a-z][a-z.]*[a-z]$/    # NB: minimum two a-z letters required
        key_col   = values[0]
        title_col = values[1]
        more_cols = values[2..-1]
      else
        key_col   = nil       # autogenerate from title (see below)
        title_col = values[0]
        more_cols = values[1..-1]
      end

      ## title (split of optional synonyms)
      # e.g. FC Bayern Muenchen|Bayern Muenchen|Bayern
      titles = title_col.split( '|' )

      if titles.empty?
        logger.warn "skipping line without title >>#{line}<<"
        next
      end

      attribs = {}
      attribs[ :title ] = titles[0]

      ## add optional synonyms if present
      attribs[ :synonyms ] = titles[1..-1].join( '|' ) if titles.size > 1

      if key_col.nil?
        ## autogenerate key from first title
        key_col = title_to_key( titles[0] )
        logger.debug "   autogen key >#{key_col}< from title >#{titles[0]}<"
      end

      attribs[ :key ] = key_col

      attribs = attribs.merge( @more_values )  # e.g. merge country_id and other defaults if present

      yield( attribs, more_cols )

    end # each lines

  end # method each_line


  # Generates a key from a title e.g. Wien [Vienna] => wien
  #   - downcases the title
  #   - removes parts in square brackets ([]) and round brackets (())
  #   - removes whitespace and punctuation
  #   - replaces accented chars with ascii look alikes (if listed)
  def title_to_key( title )

    key = title.downcase

    ### remove optional english translation in square brackets ([]) e.g. Wien [Vienna]
    key = key.gsub( /\[.+\]/, '' )

    ## remove optional longer title part in () e.g. Las Palmas (de Gran Canaria), Palma (de Mallorca)
    key = key.gsub( /\(.+\)/, '' )

    ## remove all whitespace and punctuation
    key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )

    TITLE_KEY_ALTERNATIVES.each do |alt|
      key = key.gsub( alt[0], alt[1] )
    end

    key
  end # method title_to_key


end # class ValuesReader
@@ -0,0 +1,71 @@
1
+ # encoding: utf-8
2
+
3
+
4
class File
  # Reads the whole file as utf-8 text (a leading BOM - byte order mark -
  # gets removed if present) and returns the text.
  #
  # For convenience "fancy" dashes (em dash, en dash) get converted to
  # a "plain" dash (-); a single warning per file gets printed if any
  # dash got converted.
  def self.read_utf8( path )
    text = open( path, 'r:bom|utf-8' ) { |file| file.read }

    ### for convenience for now convert fancy dash to "plain" dash
    ##   todo: check char codes for "fancy" dash alternatives
    ##  nb: gsub! returns nil if nothing got replaced
    if text.gsub!( /[—–]/, '-' )
      puts "*** warning: convert fancy dash to 'plain' dash in file >#{path}<"
    end

    text
  end
end # class File
19
+
20
+
21
+ ############
22
+ ### fix/todo: share helper for all text readers/parsers- where to put it?
23
+ ###
24
+
25
# Turns a title into a regex pattern (string) matching the title
#   - regex special chars get escaped e.g. . to \. and ( to \( etc.
#       e.g. Benfica Lis.
#       e.g. Club Atlético Colón (Santa Fe)
#   - accented chars match with or without accents e.g. ü becomes (ü|ue)
#   - dash (-) is optional e.g. - becomes (-| ), thus, Blau-Weiss == Blau Weiss
#
# Returns the pattern as a string (not a Regexp).
def title_esc_regex( title_unescaped )

  ## NB: cannot use Regexp.escape! will escape space ' ' to '\ ' (and - to \-)
  ##  thus, escape the regex special chars "by hand"
  title = title_unescaped.gsub( /[.()\[\]{}+*?|^$\\]/ ) { |char| "\\#{char}" }

  ## match accented char with or without accents
  ##  add (ü|ue) etc.
  ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss

  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  ##
  ##  reuse for all readers!

  alternatives = {
    '-' => '(-| )',   ## e.g. Blau-Weiß Linz
    'æ' => '(æ|ae)',  ## e.g.
    'á' => '(á|a)',   ## e.g. Bogotá, Sársfield
    'ã' => '(ã|a)',   ## e.g  São Paulo
    'ä' => '(ä|ae)',  ## e.g.
    'ç' => '(ç|c)',   ## e.g. Fenerbahçe
    'é' => '(é|e)',   ## e.g. Vélez
    'ê' => '(ê|e)',   ## e.g. Grêmio
    'ñ' => '(ñ|n)',   ## e.g. Porteño
    'ň' => '(ň|n)',   ## e.g. Plzeň
    'Ö' => '(Ö|Oe)',  ## e.g. Österreich
    'ö' => '(ö|oe)',  ## e.g. Mönchengladbach
    'ó' => '(ó|o)',   ## e.g. Colón
    'ș' => '(ș|s)',   ## e.g. Bucarești
    'ß' => '(ß|ss)',  ## e.g. Blau-Weiß Linz
    'ü' => '(ü|ue)',  ## e.g.
    'ú' => '(ú|u)'    ## e.g. Fútbol
  }

  ## replace all chars in a single pass (replacements do NOT get replaced again)
  title.gsub( Regexp.union( alternatives.keys ), alternatives )
end
@@ -0,0 +1,7 @@
1
+
2
module TextUtils

  # version of the textutils gem
  #  nb: frozen to protect the (shared) string from getting changed in place
  VERSION = '0.3.0'.freeze

end # module TextUtils
7
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 2
8
+ - 3
9
9
  - 0
10
- version: 0.2.0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Gerald Bauer
@@ -15,12 +15,28 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-06-09 00:00:00 Z
18
+ date: 2013-02-20 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
- name: rdoc
21
+ name: logutils
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ hash: 23
29
+ segments:
30
+ - 0
31
+ - 2
32
+ - 0
33
+ version: 0.2.0
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: rdoc
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
24
40
  none: false
25
41
  requirements:
26
42
  - - ~>
@@ -31,22 +47,22 @@ dependencies:
31
47
  - 10
32
48
  version: "3.10"
33
49
  type: :development
34
- version_requirements: *id001
50
+ version_requirements: *id002
35
51
  - !ruby/object:Gem::Dependency
36
52
  name: hoe
37
53
  prerelease: false
38
- requirement: &id002 !ruby/object:Gem::Requirement
54
+ requirement: &id003 !ruby/object:Gem::Requirement
39
55
  none: false
40
56
  requirements:
41
57
  - - ~>
42
58
  - !ruby/object:Gem::Version
43
- hash: 7
59
+ hash: 1
44
60
  segments:
45
61
  - 3
46
- - 0
47
- version: "3.0"
62
+ - 3
63
+ version: "3.3"
48
64
  type: :development
49
- version_requirements: *id002
65
+ version_requirements: *id003
50
66
  description: textutils - Text Filters and Helpers
51
67
  email: webslideshow@googlegroups.com
52
68
  executables: []
@@ -65,9 +81,15 @@ files:
65
81
  - lib/textutils/filter/comment_filter.rb
66
82
  - lib/textutils/filter/erb_django_filter.rb
67
83
  - lib/textutils/filter/erb_filter.rb
84
+ - lib/textutils/reader/code_reader.rb
85
+ - lib/textutils/reader/hash_reader.rb
86
+ - lib/textutils/reader/line_reader.rb
87
+ - lib/textutils/reader/values_reader.rb
88
+ - lib/textutils/utils.rb
89
+ - lib/textutils/version.rb
68
90
  homepage: http://geraldb.github.com/textutils
69
- licenses: []
70
-
91
+ licenses:
92
+ - Public Domain
71
93
  post_install_message:
72
94
  rdoc_options:
73
95
  - --main
@@ -79,10 +101,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
79
101
  requirements:
80
102
  - - ">="
81
103
  - !ruby/object:Gem::Version
82
- hash: 3
104
+ hash: 55
83
105
  segments:
84
- - 0
85
- version: "0"
106
+ - 1
107
+ - 9
108
+ - 2
109
+ version: 1.9.2
86
110
  required_rubygems_version: !ruby/object:Gem::Requirement
87
111
  none: false
88
112
  requirements: