textutils 0.8.5 → 0.8.6

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt CHANGED
@@ -24,6 +24,7 @@ lib/textutils/reader/hash_reader.rb
24
24
  lib/textutils/reader/line_reader.rb
25
25
  lib/textutils/reader/values_reader.rb
26
26
  lib/textutils/sanitizier.rb
27
+ lib/textutils/title.rb
27
28
  lib/textutils/utils.rb
28
29
  lib/textutils/version.rb
29
30
  test/helper.rb
@@ -0,0 +1,152 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ ## todo: rename to TitleFinder or TitleMapper ??
5
+ # other options TitleMatcher?
6
+ # TitleMapping? TitleMappings?
7
+ # or rename to KeyMapping?, KeyMapper?, KeyTable? etc.
8
+
9
+
10
+ module TextUtils
11
+ module TitleTable
12
+
13
+ ####
14
+ ## fix: turn it into a class w/ methods
15
+ #
16
+ # e.g. t = TitleTable.new( records, name ) # e.g. name='team'
17
+ # t.map!( line )
18
+ # t.find_key!( line )
19
+ # etc.
20
+
21
+
22
+ def build_title_table_for( records )
23
+ ## build known titles table w/ synonyms e.g.
24
+ #
25
+ # [[ 'wolfsburg', [ 'VfL Wolfsburg' ]],
26
+ # [ 'augsburg', [ 'FC Augsburg', 'Augi2', 'Augi3' ]],
27
+ # [ 'stuttgart', [ 'VfB Stuttgart' ]] ]
28
+
29
+ known_titles = []
30
+
31
+ records.each_with_index do |rec,index|
32
+
33
+ title_candidates = []
34
+ title_candidates << rec.title
35
+
36
+ title_candidates += rec.synonyms.split('|') if rec.synonyms.present?
37
+
38
+
39
+ ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
40
+ # make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
41
+
42
+ titles = []
43
+ title_candidates.each do |t|
44
+ titles << t
45
+ if t =~ /\(.+\)/
46
+ extra_title = t.gsub( /\(.+\)/, '' ) # remove/delete subtitles
47
+ extra_title.strip! # strip leading and trailing whitespace too!
48
+ titles << extra_title
49
+ end
50
+ end
51
+
52
+
53
+ ## NB: sort here by length (largest goes first - best match)
54
+ # exclude code and key (key should always go last)
55
+ titles = titles.sort { |left,right| right.length <=> left.length }
56
+
57
+ ## escape for regex plus allow subs for special chars/accents
58
+ titles = titles.map { |title| TextUtils.title_esc_regex( title ) }
59
+
60
+ ## NB: only include code field - if defined
61
+ titles << rec.code if rec.respond_to?(:code) && rec.code.present?
62
+
63
+ known_titles << [ rec.key, titles ]
64
+
65
+ ### fix: use plain logger
66
+ LogUtils::Logger.root.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"
67
+ end
68
+
69
+ known_titles
70
+ end
71
+
72
+
73
+
74
+ def find_key_for!( name, line )
75
+ regex = /@@oo([^@]+?)oo@@/ # e.g. everything in @@ .... @@ (use non-greedy +? plus all chars but not @, that is [^@])
76
+
77
+ upcase_name = name.upcase
78
+ downcase_name = name.downcase
79
+
80
+ if line =~ regex
81
+ value = "#{$1}"
82
+ ### fix: use plain logger
83
+ LogUtils::Logger.root.debug " #{downcase_name}: >#{value}<"
84
+
85
+ line.sub!( regex, "[#{upcase_name}]" )
86
+
87
+ return $1
88
+ else
89
+ return nil
90
+ end
91
+ end
92
+
93
+
94
+ def find_keys_for!( name, line ) # NB: keys (plural!) - will return array
95
+ counter = 1
96
+ keys = []
97
+
98
+ downcase_name = name.downcase
99
+
100
+ key = find_key_for!( "#{downcase_name}#{counter}", line )
101
+ while key.present?
102
+ keys << key
103
+ counter += 1
104
+ key = find_key_for!( "#{downcase_name}#{counter}", line )
105
+ end
106
+
107
+ keys
108
+ end
109
+
110
+
111
+ def map_titles_for!( name, line, title_table )
112
+ title_table.each do |rec|
113
+ key = rec[0]
114
+ values = rec[1]
115
+ map_title_worker_for!( name, line, key, values )
116
+ end
117
+ end
118
+
119
+
120
+ def map_title_worker_for!( name, line, key, values )
121
+
122
+ downcase_name = name.downcase
123
+
124
+ values.each do |value|
125
+ ## nb: \b (word boundary) only matches next to a word char (alphanums e.g. a-z0-9); it does NOT match between a non-word char and a space or newline
126
+ ## (thus add it, allows match for Benfica Lis. for example - note . at the end)
127
+
128
+ ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
129
+ regex = /\b#{value}(\b| |\t|$)/ # wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
130
+ if line =~ regex
131
+ ### fix: use plain logger
132
+ LogUtils::Logger.root.debug " match for #{downcase_name} >#{key}< >#{value}<"
133
+ # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
134
+ line.sub!( regex, "@@oo#{key}oo@@ " ) # NB: add one space char at end
135
+ return true # break out after first match (do NOT continue)
136
+ end
137
+ end
138
+ return false
139
+ end
140
+
141
+ end # module TitleTable
142
+ end # module TextUtils
143
+
144
+
145
+ ## auto-include methods
146
+
147
+ module TextUtils
148
+ # make helpers available as class methods e.g. TextUtils.convert_unicode_dashes_to_plain_ascii
149
+ extend TitleTable # lets us use TextUtils.build_title_table_for etc.
150
+ end
151
+
152
+
@@ -1,8 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.8.5'
4
+ VERSION = '0.8.6'
5
5
 
6
6
  end # module TextUtils
7
-
8
-
data/lib/textutils.rb CHANGED
@@ -44,6 +44,7 @@ require 'textutils/reader/values_reader'
44
44
  require 'textutils/reader/fixture_reader'
45
45
 
46
46
  require 'textutils/classifier'
47
+ require 'textutils/title' # title table/mapper/finder utils
47
48
 
48
49
  require 'textutils/page' # for book pages and page templates
49
50
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.5
4
+ version: 0.8.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-02-15 00:00:00.000000000 Z
12
+ date: 2014-02-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &21250380 !ruby/object:Gem::Requirement
16
+ requirement: &20840748 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *21250380
24
+ version_requirements: *20840748
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &21250032 !ruby/object:Gem::Requirement
27
+ requirement: &20840412 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '4.0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *21250032
35
+ version_requirements: *20840412
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &21249564 !ruby/object:Gem::Requirement
38
+ requirement: &20839944 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '3.7'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *21249564
46
+ version_requirements: *20839944
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
48
  email: ruby-talk@ruby-lang.org
49
49
  executables: []
@@ -79,6 +79,7 @@ files:
79
79
  - lib/textutils/reader/line_reader.rb
80
80
  - lib/textutils/reader/values_reader.rb
81
81
  - lib/textutils/sanitizier.rb
82
+ - lib/textutils/title.rb
82
83
  - lib/textutils/utils.rb
83
84
  - lib/textutils/version.rb
84
85
  - test/helper.rb