factbook-readers 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
@@ -1,68 +0,0 @@
1
-
2
- module Factbook
3
-
4
- ######
5
- # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
6
-
7
- class JsonBuilder
8
- include LogUtils::Logging
9
- include NormalizeHelper ## e.g. normalize_category
10
-
11
-
12
- attr_reader :text,
13
- :json,
14
- :info, ## not used yet -- page info incl. country_name, region_name, last_updated etc.
15
- :errors, ## not used yet -- encoding errors etc.
16
- :sects
17
-
18
-
19
- def initialize( text )
20
- @text = text
21
-
22
- @json = JSON.parse( text )
23
-
24
- @info = nil ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
25
- @errors = [] ## fix/todo: sorry - for now no errors possible/tracked
26
-
27
- @sects = []
28
-
29
- @json.each do |k1,v1|
30
- sect_title = k1
31
- sect_subsects = v1
32
-
33
- sect = Sect.new
34
- sect.title = sect_title
35
-
36
- ## get subsections
37
- subsects = []
38
- sect_subsects.each do |k2,v2|
39
- subsect_title = k2
40
- subsect_data = v2
41
-
42
- subsect = Subsect.new
43
- subsect.title = subsect_title
44
-
45
- #####
46
- ## note: run data hash through normalize_category (again)
47
- if subsect_data.is_a?( Hash )
48
- new_subsect_data = {}
49
- subsect_data.each do |k3,v3|
50
- new_subsect_data[ normalize_category(k3) ] = v3
51
- end
52
- subsect_data = new_subsect_data
53
- end
54
-
55
- subsect.data = subsect_data
56
-
57
- subsects << subsect
58
- end
59
-
60
- sect.subsects = subsects
61
- @sects << sect
62
- end
63
- end
64
-
65
- end # class JsonBuilder
66
-
67
-
68
- end # module Factbook
@@ -1,121 +0,0 @@
1
- ##
2
- # note:
3
- # the factbook category/region for world is other entities (on FAQ) and oceans in page
4
- # changed to world
5
-
6
-
7
- module Factbook
8
-
9
- class Codes
10
-
11
- Code = Struct.new( :code, ## todo: add notes (country affiliation) - why? why not??
12
- :name,
13
- :category, ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
14
- :region, ## e.g. Europe, Oceans, etc.
15
- )
16
-
17
- def self.read_csv( path )
18
- ###
19
- # note:
20
- # if you use quotes - NO leading spaces allowed e.g.
21
- # use au,"Austria",... and NOT
22
- # au, "Austria", ...
23
- #
24
- # for headers - NO leading spaces allowed e.g.
25
- # use Code,Name,Category,Region,... and NOT
26
- # Code, Name, Category, Region, ...
27
-
28
- rows = CsvHash.read( path )
29
-
30
- pp rows
31
-
32
- recs = []
33
- rows.each do |row|
34
- pp row
35
- rec = Code.new
36
- rec.code = row['Code'].strip ## remove leading n trailing whitespaces
37
- rec.name = row['Name'].strip
38
-
39
- ## note: for now category and region are optional
40
- rec.category = row['Category'].strip if row['Category'] && row['Category'].size > 0
41
- rec.region = row['Region'].strip if row['Region'] && row['Region'].size > 0
42
-
43
- pp rec
44
- recs << rec
45
- end
46
-
47
- new( recs )
48
- end
49
-
50
-
51
- def initialize( codes )
52
- @codes = codes
53
- end
54
-
55
- def size() @codes.size; end
56
-
57
- def each( &blk ) @codes.each( &blk ); end
58
- def select( &blk )
59
- codes = @codes.select( &blk )
60
- Codes.new( codes ) ## return (again) new Codes obj for easy-chaining - why? why not?
61
- end
62
-
63
-
64
- def to_a
65
- @codes.collect {|code| code.code } ## return array of codes
66
- end
67
-
68
- ## def all() self.to_a; end ## note: alias for to_a - use - why? why not??
69
-
70
- ## "pre-defined" convenience shortcuts
71
- def countries() category 'Countries'; end
72
- def world() category 'World'; end
73
- def oceans() category 'Oceans'; end
74
- def misc() category 'Miscellaneous'; end
75
- def others() category 'Other'; end
76
- def dependencies() category 'Dependencies'; end
77
- def dependencies_us() category 'Dependencies (United States)'; end
78
- ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.
79
-
80
- def europe() region 'Europe'; end
81
- def south_asia() region 'South Asia'; end
82
- def central_asia() region 'Central Asia'; end
83
- def east_n_souteast_asia() region 'East & Southeast Asia'; end
84
- def middle_east() region 'Middle East'; end
85
- def africa() region 'Africa'; end
86
- def north_america() region 'North America'; end
87
- def central_america_n_caribbean() region 'Central America and Caribbean'; end
88
- def south_america() region 'South America'; end
89
- def australia_oceania() region 'Australia-Oceania'; end
90
- def antartica() region 'Antarctica'; end
91
-
92
- ## note: regions oceans and world - same as category oceans and world
93
- ## use oceans_ii or world_ii or something ??
94
- ## use category('World') n region('World')
95
- ## use category('Oceans') n region('Oceans')
96
-
97
-
98
- def category( query )
99
- ## todo/future: allow passing in of regex too (not just string)
100
- ## note: e.g. Dependencies (France) needs to get escpaed to
101
- ## Dependencies \(France\) etc.
102
- filter_regex = /#{Regexp.escape(query)}/i
103
- codes = @codes.select do |code|
104
- code.category ? filter_regex.match( code.category ) : false ## note: allow nil for category; will fail on search
105
- end
106
- Codes.new( codes ) ## return new Codes obj for easy-chaining
107
- end
108
-
109
- def region( query )
110
- ## todo/future: allow passing in of regex too (not just string)
111
- filter_regex = /#{Regexp.escape(query)}/i
112
- codes = @codes.select do |code|
113
- code.region ? filter_regex.match( code.region ) : false ## note: allow nil for region; will fail on search
114
- end
115
- Codes.new( codes ) ## return new Codes obj for easy-chaining
116
- end
117
-
118
- end # class codes
119
-
120
- end # module Factbook
121
-
@@ -1,49 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class Comparisons
5
-
6
- Comparison = Struct.new( :num, ### todo: use no or id or something - why? why not?
7
- :category, ## e.g. Geography, People, Economy, etc.
8
- :name,
9
- )
10
-
11
- def self.read_csv( path )
12
-
13
- rows = CsvHash.read( path )
14
-
15
- pp rows
16
-
17
- recs = []
18
- rows.each do |row|
19
- pp row
20
- rec = Comparison.new
21
- rec.num = row['Num'].strip.to_i ## remove leading n trailing whitespaces
22
- rec.category = row['Category'].strip
23
- rec.name = row['Name'].strip
24
-
25
- pp rec
26
- recs << rec
27
- end
28
-
29
- new( recs )
30
- end
31
-
32
- def initialize( comps )
33
- @comps = comps
34
- end
35
-
36
- def size() @comps.size; end
37
-
38
- def each
39
- @comps.each {|comp| yield( comp ) }
40
- end
41
-
42
- def to_a
43
- @comps.collect {|comp| comp.num } ## return array of nums -- return something else - why? why not?
44
- end
45
-
46
- end # class Comparison
47
-
48
- end # module Factbook
49
-
@@ -1,42 +0,0 @@
1
-
2
- module Factbook
3
- module NormalizeHelper
4
-
5
-
6
- def normalize_category( text )
7
-
8
- ## note: fix typos/errors with double colons e.g. note:: (instead of note:)
9
-
10
- text = text.strip
11
- text = text.sub( /:+\z/, '' ) # remove trailing : if present -- note: allow (fix) note:: too, thus, use :+
12
- text = text.strip
13
-
14
- #######################################
15
- ### special cases
16
-
17
- ## typos e.g ntoe => use note
18
- text = 'note' if text == 'ntoe'
19
- text = 'investment in fixed capital' if text == 'investment if fixed capital'
20
-
21
- ## downcase
22
- text = 'lowest point' if text == 'Lowest point'
23
- text = 'chief of state' if text == 'Chief of state'
24
-
25
- ## spelling variant (use more popular one)
26
- text = 'signed, but not ratified' if text == 'signed but not ratified'
27
- text = 'vectorborne diseases' if text == 'vectorborne disease'
28
- text = 'water contact disease' if text == 'water contact diseases'
29
- text = 'food or waterborne diseases' if text == 'food or waterborne disease'
30
- text = 'geographic coordinates' if text == 'geographical coordinates'
31
- text = 'note' if text == 'notes'
32
- text = 'refugees (country of origin)' if text == 'refugees (countries of origin)'
33
-
34
- ## border countries (8): -- remove (x) counter
35
- text = 'border countries' if text.start_with?( 'border countries')
36
-
37
- text
38
- end
39
-
40
-
41
- end # module NormalizeHelper
42
- end # module Factbook
@@ -1,50 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- class JsonPageReader
6
- def initialize( json_dir )
7
- @json_dir = json_dir
8
- end
9
-
10
- def read_page( code )
11
- path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
12
-
13
- puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
14
- json = File.read( path, 'r:utf-8' ) { |f| f.read }
15
-
16
- ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
17
- # add some page info from code struct
18
-
19
- info = PageInfo.new
20
- info.country_code = code.code
21
- info.country_name = code.name
22
- info.region_name = code.region
23
-
24
- page = Page.new( code.code, json: json, info: info )
25
- page
26
- end
27
-
28
- def read_pages( codes, limit: nil )
29
- pages = []
30
- i=0
31
- codes.each do |code|
32
- next if limit && i > limit ## for debugging just process first x entries
33
-
34
- pages << read_page( code )
35
- end
36
- pages
37
- end
38
-
39
- private
40
- def region_to_slug( text )
41
- ## change and => n
42
- ## change & => n
43
- ## change all spaces to => -
44
- ## e.g. East & Southeast Asia => east-n-southeast-asia
45
- ## Central America and Caribbean => central-america-n-caribbean
46
- text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
47
- end
48
- end ## JsonPageReader
49
-
50
- end # module Factbook
@@ -1,351 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class Sanitizer
5
- include LogUtils::Logging
6
- include Utils ## e.g. find_page_info etc.
7
-
8
- def sanitize( html )
9
- ## todo: add option for (html source) encoding - why?? why not??
10
-
11
- ## note:
12
- ## returns 1) html profile without headers, footers, scripts, etc.
13
- ## 2) page (meta) info e.g. country_name, country_code, last_updated, etc.
14
- ## 3) errors e.g. list of errors e.g. encoding errors (invalid byte sequence etc.)
15
-
16
- page_info = PageInfo.new
17
-
18
- ## todo:
19
- ## make page info optional? why? why not?
20
- ## not always available (if page structure changes) - check
21
- ## what page info is required??
22
- h = find_page_info( html )
23
- if h
24
- page_info.country_code = h[:country_code]
25
- page_info.country_name = h[:country_name]
26
- page_info.country_affiliation = h[:country_affiliation]
27
- page_info.region_code = h[:region_code]
28
- page_info.region_name = h[:region_name]
29
- else
30
- page_info.country_code = find_country_code( html )
31
- ## print/warn: no page info found
32
- end
33
-
34
-
35
- page_info.last_updated = find_page_last_updated( html )
36
-
37
-
38
- html_profile = find_country_profile( html ) ## cut-off headers, footers, scripts, etc.
39
-
40
- ## todo/check: remove 3rd args old errors array - why? why not?
41
- [html_profile, page_info, []]
42
- end
43
-
44
-
45
-
46
- def find_country_profile( html )
47
- ####
48
- ## remove header (everything before)
49
- ## <ul class="expandcollapse">
50
-
51
- ##
52
- ## fix known broken html bugs
53
- ## in co (Colombia) page (Nov/11 2020):
54
- ## <div class="photogallery_captiontext">
55
- ## <p>slightly less than twice the size of Texas</p
56
- ## </div>
57
- ## note: </p => unclosed p!! change to </p>
58
-
59
- ## note: in regex use negative lookahead e.g. (?!pattern)
60
- html = html.gsub( %r{</p(?![>])} ) do |m|
61
- puts "!! WARN: fixing unclosed </p => </p>"
62
- puts "#{m}"
63
- '</p>'
64
- end
65
-
66
-
67
- doc = Nokogiri::HTML( html )
68
-
69
- ul = doc.css( 'ul.expandcollapse' )[0]
70
-
71
- puts ul.to_html[0..100]
72
-
73
-
74
- ###
75
- ## sanitize
76
- ## remove link items
77
- ## assume two <li>s are a section
78
-
79
- html = String.new('')
80
-
81
- ## filter all li's
82
- ul_children = ul.children.select { |el| if el.name == 'li'
83
- true
84
- else
85
- # puts "skipping #{el.name} >#{el.to_html}<"
86
- false
87
- end
88
- }
89
- ## ul_children = ul.css( 'li' )
90
-
91
- puts " #{ul_children.size} li(s):"
92
- ul_children.each_slice(2) do |lis|
93
- li = lis[0]
94
- div = li.at( 'div[sectiontitle]' )
95
- if div.nil?
96
- puts "!! ERROR: no section title found in div:"
97
- puts li.to_html
98
- exit 1
99
- end
100
-
101
- section_title = div['sectiontitle'].to_s
102
-
103
- html << "<h2>#{section_title}</h2>\n"
104
-
105
-
106
- li = lis[1]
107
- ## filter all div's
108
- li_children = li.children.select { |el| if el.name =='div'
109
- true
110
- else
111
- # puts "skipping #{el.name} >#{el.to_html}<"
112
- false
113
- end
114
- }
115
- puts " #{li_children.size} div(s) in >#{section_title}<:"
116
-
117
-
118
- ## check special case in world Geographic overview:
119
- # <div class="category oce_light" style="padding-left:5px;"
120
- # id="field-anchor-geography-geographic-overview">
121
- # Geographic overview:
122
- # <span class="field-listing-link">
123
- # <a href="../fields/275.html#XX">
124
- # <img alt="Geographic overview field listing"
125
- # title="Geographic overview field listing"
126
- # src="../images/field_listing.gif" /></a>
127
- # </span>
128
- #</div>
129
- # vs regular
130
- #
131
- # <div class="category oce_light" style="padding-left:5px;"
132
- # id="field-anchor-geography-area-comparative">
133
- # <span class="btn-tooltip definition" role="tooltip" aria-hidden='true'>
134
- # <a aria-label="Use this link to access a description of the Area - comparative field"
135
- # href="../docs/notesanddefs.html#280">
136
- # Area - comparative
137
- # </a>:
138
- # <span class="tooltip-content">
139
- # This entry provides an area comparison based on total area equivalents. Most entities are compared with the entire US or one of the 50 states based on area measurements (1990 revised) provided by the US Bureau of the Census. The smaller entities are compared with Washington, DC (178 sq km, 69 sq mi) or The Mall in Washington, DC (0.59 sq km, 0.23 sq mi, 146 acres).
140
- # </span>
141
- # </span>
142
- # <span class="field-listing-link">
143
- # <a href="../fields/280.html#XX"><img alt="Area - comparative field listing" title="Area - comparative field listing" src="../images/field_listing.gif" /></a>
144
- # </span>
145
- # </div>
146
-
147
- li_children.each_slice(2) do |divs|
148
- div = divs[0]
149
-
150
- ## try new way - try clean-up / rm first
151
- span_tooltip_content = div.at( 'span.tooltip-content' )
152
- if span_tooltip_content
153
- span_tooltip_content.inner_html = ''
154
- span_tooltip_content.replace( '' ) ## check for how to delete/remove - why? why not!!
155
- end
156
-
157
- span_field_listing_link = div.at( 'span.field-listing-link' )
158
- if span_field_listing_link
159
- span_field_listing_link.inner_html = ''
160
- span_field_listing_link.replace( '' )
161
- end
162
-
163
- subsection_title = div.text.strip
164
- html << "\n<h3>#{subsection_title}</h3>\n"
165
-
166
- # a = div.css('a')[0]
167
- # if a
168
- # subsection_title = a.text ## todo/check/rename: use field_name or such - why? why not?
169
- # html << "\n<h3>#{subsection_title}:</h3>\n"
170
- # else
171
- # subsection_title = '???'
172
- # puts "!! WARN: no anchor found:"
173
- # puts div.to_html
174
- # end
175
-
176
-
177
- div = divs[1]
178
- div_children = div.children.select {|el| el.name == 'div' ? true : false }
179
- puts " #{div_children.size} div(s) in field >#{subsection_title}<:"
180
-
181
- ## use more robust version - only get divs with category_data
182
- ## div_children = div.css( 'div.category_data' )
183
- ## puts " #{div_children.size} div(s) in field >#{subsection_title}< v2:"
184
-
185
- # if div_children.size > 14
186
- # ## us labor force has 11 divs
187
- # ## possibly an error
188
- # puts "!! ERROR - too many category_data divs found:"
189
- # puts div.to_html[0..200]
190
- # puts "\n...\n"
191
- # puts puts div.to_html[-400..-1]
192
- # exit 1
193
- # end
194
-
195
- div_children.each do |catdiv|
196
- if catdiv['class'] && catdiv['class'].index( 'category_data' )
197
-
198
- if catdiv['class'].index( 'attachment' )
199
- ## skip attachments e.g. maps, pop pyramids, etc.
200
- else
201
- html << sanitize_data( catdiv, title: subsection_title )
202
- html << "\n"
203
- end
204
- else
205
- if catdiv.to_html.index( 'country comparison to the world' )
206
- ## simplify/unlinkify country comparison
207
- ## <div>
208
- ## <span class='category'>country comparison to the world:</span>
209
- ## <span class='category_data'>
210
- ## <a href="../fields/335rank.html#AU">97</a>
211
- ## </span>
212
- ## </div>
213
- ## e.g. to =>
214
- ## <div>
215
- ## country comparison to the world: 97
216
- ## </div>
217
- html << "<div>\n #{squish( catdiv.text.strip )}\n</div>"
218
- html << "\n"
219
- else
220
- puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
221
- puts catdiv.to_html
222
- exit 1
223
- end
224
- end
225
- end
226
- end
227
- end
228
-
229
- html
230
- end
231
-
232
-
233
- #
234
- # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
235
- #
236
- # remove aria labels
237
- ARIA_ATTR_RE = /\s*
238
- aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
239
- /xim ## do NOT allow multi-line - why? why not?
240
-
241
- ## find double breaks e.g. <br><br>
242
- BR_BR_RE = /(<br> \s* <br>)
243
- /xim ## do NOT allow multi-line - why? why not?
244
-
245
-
246
- def sanitize_data( el, title: )
247
- ## todo/fix/check:
248
- ## check if more than one p(aragraph)
249
- ## get squeezed together without space in between?
250
-
251
-
252
- ## step 0: replace all possible a(nchor) links with just inner text
253
- el.css( 'a').each do |a|
254
- a.replace( " #{a.text.strip} " )
255
- end
256
-
257
-
258
-
259
- inner_html = String.new('')
260
-
261
- ## step 1 - unwrap paragraphs if present
262
- ## and convert dom/nokogiri doc/tree to html string
263
- p_count = 0
264
- el.children.each do |child|
265
- if child.name == 'p'
266
- ## puts " [debug ] unwrap <p> no.#{p_count+1}"
267
-
268
- p_inner_html = child.inner_html.strip ## note: unwrap! use inner_html NOT to_html/html
269
- if p_inner_html.empty?
270
- ## note: skip empty paragraphs for now
271
- else
272
- inner_html << ' ++ ' if p_count > 0
273
- inner_html << p_inner_html
274
- inner_html << " \n\n "
275
-
276
- p_count += 1
277
- end
278
- else
279
- inner_html << child.to_html
280
- end
281
- end
282
- ## note: keep container div!! just replace inner html!!!
283
- ## note: right strip all trailing spaces/newlines for now
284
- ## plus add back a single one for pretty printing
285
-
286
- ## note: replace all non-breaking spaces with spaces for now
287
- ## see fr (france) in political parties section for example
288
- ## todo/check/fix: check if we need to use unicode char!! and NOT html entity
289
- inner_html = inner_html.gsub( "&nbsp;", ' ' )
290
- ## Unicode Character 'NO-BREAK SPACE' (U+00A0)
291
- inner_html = inner_html.gsub( "\u00A0", ' ' ) ## use unicode char
292
-
293
-
294
- el.inner_html = inner_html.rstrip + "\n"
295
-
296
- # finally - convert back to html (string)
297
- html = el.to_html
298
-
299
-
300
-
301
- html = html.gsub( ARIA_ATTR_RE ) do |m|
302
- ## do not report / keep silent for now
303
- ## puts "in >#{title}< remove aria-label attr:"
304
- ## puts "#{m}"
305
- ''
306
- end
307
-
308
- html = html.gsub( BR_BR_RE ) do |m|
309
- puts "in >#{title}< squish two <br>s into one:"
310
- puts "#{m}"
311
- '<br>'
312
- end
313
-
314
- html = html.gsub( /<br>/i ) do |m|
315
- puts "in >#{title}< replace <br> with inline (plain) text ++:"
316
- puts "#{m}"
317
- ' ++ '
318
- end
319
-
320
- ## cleanup/remove ++ before subfield e.g.
321
- ## of: ++ => of: or such
322
- ##
323
- ## todo/fix: add negative lookahead e.g. not another + to be more specific!!
324
- html = html.gsub( %r{
325
- (?<=([a-z]:)|(:</span>)) # note: use zero-length positive lookbehind
326
- \s+
327
- \+{2}}xim ) do |m|
328
- puts "in >#{title} remove ++ before <field>: marker:"
329
- puts "#{m}"
330
- ' '
331
- end
332
-
333
- #####
334
- # "unfancy" smart quotes to ascii - why? why not?
335
- # e.g.
336
- # Following Britain’s victory => Following Britain's victory
337
- html = html.tr( "’", "'" )
338
- # “full floor” House vote => "full floor" House vote
339
- html = html.tr( "“”", '""' )
340
-
341
- html
342
- end
343
-
344
- def squish( str )
345
- str.gsub( /[ \t\n\r]{2,}/, ' ' ) ## replace multi-spaces (incl. newlines) with one space
346
- end
347
-
348
-
349
- end # class Sanitizer
350
-
351
- end # module Factbook