textutils 0.8.5 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest.txt CHANGED
@@ -24,6 +24,7 @@ lib/textutils/reader/hash_reader.rb
24
24
  lib/textutils/reader/line_reader.rb
25
25
  lib/textutils/reader/values_reader.rb
26
26
  lib/textutils/sanitizier.rb
27
+ lib/textutils/title.rb
27
28
  lib/textutils/utils.rb
28
29
  lib/textutils/version.rb
29
30
  test/helper.rb
@@ -0,0 +1,152 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ ## todo: rename to TitleFinder or TitleMapper ??
5
+ # other options TitleMatcher?
6
+ # TitleMapping? TitleMappings?
7
+ # or rename to KeyMapping?, KeyMapper?, KeyTable? etc.
8
+
9
+
10
+ module TextUtils
11
+ module TitleTable
12
+
13
+ ####
14
+ ## fix: turn it into a class w/ methods
15
+ #
16
+ #e.g t =TitleTable.new( records, name ) # e.g. name='team'
17
+ # t.map!( line )
18
+ # t.find_key!( line )
19
+ # etc.
20
+
21
+
22
+ def build_title_table_for( records )
23
+ ## build known titles table w/ synonyms e.g.
24
+ #
25
+ # [[ 'wolfsburg', [ 'VfL Wolfsburg' ]],
26
+ # [ 'augsburg', [ 'FC Augsburg', 'Augi2', 'Augi3' ]],
27
+ # [ 'stuttgart', [ 'VfB Stuttgart' ]] ]
28
+
29
+ known_titles = []
30
+
31
+ records.each_with_index do |rec,index|
32
+
33
+ title_candidates = []
34
+ title_candidates << rec.title
35
+
36
+ title_candidates += rec.synonyms.split('|') if rec.synonyms.present?
37
+
38
+
39
+ ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
40
+ # make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
41
+
42
+ titles = []
43
+ title_candidates.each do |t|
44
+ titles << t
45
+ if t =~ /\(.+\)/
46
+ extra_title = t.gsub( /\(.+\)/, '' ) # remove/delete subtitles
47
+ extra_title.strip! # strip leading and trailing whitespace too!
48
+ titles << extra_title
49
+ end
50
+ end
51
+
52
+
53
+ ## NB: sort here by length (largest goes first - best match)
54
+ # exclude code and key (key should always go last)
55
+ titles = titles.sort { |left,right| right.length <=> left.length }
56
+
57
+ ## escape for regex plus allow subs for special chars/accents
58
+ titles = titles.map { |title| TextUtils.title_esc_regex( title ) }
59
+
60
+ ## NB: only include code field - if defined
61
+ titles << rec.code if rec.respond_to?(:code) && rec.code.present?
62
+
63
+ known_titles << [ rec.key, titles ]
64
+
65
+ ### fix: use plain logger
66
+ LogUtils::Logger.root.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"
67
+ end
68
+
69
+ known_titles
70
+ end
71
+
72
+
73
+
74
+ def find_key_for!( name, line )
75
+ regex = /@@oo([^@]+?)oo@@/ # e.g. everything in @@ .... @@ (use non-greedy +? plus all chars but not @, that is [^@])
76
+
77
+ upcase_name = name.upcase
78
+ downcase_name = name.downcase
79
+
80
+ if line =~ regex
81
+ value = "#{$1}"
82
+ ### fix: use plain logger
83
+ LogUtils::Logger.root.debug " #{downcase_name}: >#{value}<"
84
+
85
+ line.sub!( regex, "[#{upcase_name}]" )
86
+
87
+ return $1
88
+ else
89
+ return nil
90
+ end
91
+ end
92
+
93
+
94
+ def find_keys_for!( name, line ) # NB: keys (plural!) - will return array
95
+ counter = 1
96
+ keys = []
97
+
98
+ downcase_name = name.downcase
99
+
100
+ key = find_key_for!( "#{downcase_name}#{counter}", line )
101
+ while key.present?
102
+ keys << key
103
+ counter += 1
104
+ key = find_key_for!( "#{downcase_name}#{counter}", line )
105
+ end
106
+
107
+ keys
108
+ end
109
+
110
+
111
+ def map_titles_for!( name, line, title_table )
112
+ title_table.each do |rec|
113
+ key = rec[0]
114
+ values = rec[1]
115
+ map_title_worker_for!( name, line, key, values )
116
+ end
117
+ end
118
+
119
+
120
+ def map_title_worker_for!( name, line, key, values )
121
+
122
+ downcase_name = name.downcase
123
+
124
+ values.each do |value|
125
+ ## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
126
+ ## (thus add it, allows match for Benfica Lis. for example - note . at the end)
127
+
128
+ ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
129
+ regex = /\b#{value}(\b| |\t|$)/ # wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
130
+ if line =~ regex
131
+ ### fix: use plain logger
132
+ LogUtils::Logger.root.debug " match for #{downcase_name} >#{key}< >#{value}<"
133
+ # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
134
+ line.sub!( regex, "@@oo#{key}oo@@ " ) # NB: add one space char at end
135
+ return true # break out after first match (do NOT continue)
136
+ end
137
+ end
138
+ return false
139
+ end
140
+
141
+ end # module TitleTable
142
+ end # module TextUtils
143
+
144
+
145
+ ## auto-include methods
146
+
147
+ module TextUtils
148
+ # make helpers available as class methods e.g. TextUtils.convert_unicode_dashes_to_plain_ascii
149
+ extend TitleTable # lets us use TextUtils.build_title_table_for etc.
150
+ end
151
+
152
+
@@ -1,8 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.8.5'
4
+ VERSION = '0.8.6'
5
5
 
6
6
  end # module TextUtils
7
-
8
-
data/lib/textutils.rb CHANGED
@@ -44,6 +44,7 @@ require 'textutils/reader/values_reader'
44
44
  require 'textutils/reader/fixture_reader'
45
45
 
46
46
  require 'textutils/classifier'
47
+ require 'textutils/title' # title table/mapper/finder utils
47
48
 
48
49
  require 'textutils/page' # for book pages and page templates
49
50
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.5
4
+ version: 0.8.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-02-15 00:00:00.000000000 Z
12
+ date: 2014-02-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &21250380 !ruby/object:Gem::Requirement
16
+ requirement: &20840748 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *21250380
24
+ version_requirements: *20840748
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &21250032 !ruby/object:Gem::Requirement
27
+ requirement: &20840412 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '4.0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *21250032
35
+ version_requirements: *20840412
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &21249564 !ruby/object:Gem::Requirement
38
+ requirement: &20839944 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '3.7'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *21249564
46
+ version_requirements: *20839944
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
48
  email: ruby-talk@ruby-lang.org
49
49
  executables: []
@@ -79,6 +79,7 @@ files:
79
79
  - lib/textutils/reader/line_reader.rb
80
80
  - lib/textutils/reader/values_reader.rb
81
81
  - lib/textutils/sanitizier.rb
82
+ - lib/textutils/title.rb
82
83
  - lib/textutils/utils.rb
83
84
  - lib/textutils/version.rb
84
85
  - test/helper.rb