factbook-readers 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Manifest.txt +3 -25
- data/README.md +11 -69
- data/Rakefile +3 -3
- data/lib/factbook-readers.rb +5 -40
- data/lib/factbook-readers/convert.rb +37 -0
- data/lib/factbook-readers/counter.rb +7 -9
- data/lib/factbook-readers/page.rb +41 -61
- data/lib/factbook-readers/page_info.rb +15 -3
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +3 -0
- data/test/test_counter.rb +9 -6
- data/test/test_download.rb +27 -0
- data/test/test_fields.rb +44 -27
- data/test/test_json.rb +4 -4
- data/test/test_page.rb +8 -8
- data/test/test_version.rb +15 -0
- metadata +11 -48
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook-readers/builder.rb +0 -187
- data/lib/factbook-readers/builder_item.rb +0 -201
- data/lib/factbook-readers/builder_json.rb +0 -68
- data/lib/factbook-readers/codes.rb +0 -121
- data/lib/factbook-readers/comparisons.rb +0 -49
- data/lib/factbook-readers/normalize.rb +0 -42
- data/lib/factbook-readers/reader_json.rb +0 -50
- data/lib/factbook-readers/sanitizer.rb +0 -351
- data/lib/factbook-readers/sect.rb +0 -28
- data/lib/factbook-readers/subsect.rb +0 -17
- data/lib/factbook-readers/table.rb +0 -51
- data/lib/factbook-readers/utils.rb +0 -47
- data/lib/factbook-readers/utils_info.rb +0 -128
- data/test/test_builder.rb +0 -30
- data/test/test_codes.rb +0 -72
- data/test/test_comparisons.rb +0 -16
- data/test/test_item_builder.rb +0 -97
- data/test/test_json_builder.rb +0 -23
- data/test/test_normalize.rb +0 -21
- data/test/test_sanitizer.rb +0 -36
- data/test/test_sanitizer_regex.rb +0 -87
@@ -1,68 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
######
|
5
|
-
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
|
6
|
-
|
7
|
-
##
# Rebuilds the sections / subsections of a page from a "dumped" json text
# (instead of parsing the html page).
#
# The parsed json is walked two levels deep:
#   level 1 key/value => Sect    (title, subsects)
#   level 2 key/value => Subsect (title, data)
# If the data of a subsection is a Hash, all of its keys get run
# through normalize_category (again).
class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category

  attr_reader :text,
              :json,
              :info,     ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,   ## not used yet -- encoding errors etc.
              :sects

  # text - json document as a string; gets parsed with JSON.parse
  def initialize( text )
    @text   = text
    @json   = JSON.parse( text )
    @info   = nil    ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []     ## fix/todo: sorry - for now no errors possible/tracked
    @sects  = []

    @json.each do |sect_title, sect_subsects|
      sect = Sect.new
      sect.title = sect_title

      sect.subsects = sect_subsects.map do |subsect_title, subsect_data|
        subsect = Subsect.new
        subsect.title = subsect_title
        subsect.data  = normalize_data( subsect_data )
        subsect
      end

      @sects << sect
    end
  end

  private

  # Returns data as is unless it is a Hash; for a Hash returns a new Hash
  # with every key passed through normalize_category (values untouched).
  def normalize_data( data )
    return data unless data.is_a?( Hash )

    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end
end # class JsonBuilder
|
66
|
-
|
67
|
-
|
68
|
-
end # module Factbook
|
@@ -1,121 +0,0 @@
|
|
1
|
-
##
|
2
|
-
# note:
|
3
|
-
# the factbook category/region for world is other entities (on FAQ) and oceans in page
|
4
|
-
# changed to world
|
5
|
-
|
6
|
-
|
7
|
-
module Factbook
|
8
|
-
|
9
|
-
##
# List of factbook codes (records with code, name, category and region)
# with chainable filters, e.g.
#
#   codes.countries.europe.to_a   # => array of code strings
#
# Every filter (select, category, region and the shortcuts) returns
# a new Codes object; the receiver is not changed.
class Codes

  Code = Struct.new( :code,     ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category, ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region    ## e.g. Europe, Oceans, etc.
                   )

  ##
  # Builds a Codes list from the rows returned by CsvHash.read( path ).
  #
  # Expects the columns Code and Name; Category and Region are optional
  # (a missing or empty value leaves the attribute nil).
  # All values get leading and trailing whitespace removed.
  #
  # note:
  #   if you use quotes - NO leading spaces allowed e.g.
  #     use au,"Austria",...   and NOT
  #         au, "Austria", ...
  #
  #   for headers - NO leading spaces allowed e.g.
  #     use Code,Name,Category,Region,...   and NOT
  #         Code, Name, Category, Region, ...
  def self.read_csv( path )
    rows = CsvHash.read( path )

    ## note: no (debug) dumps of rows/records to stdout here;
    ##         reading a datafile must not print anything
    recs = rows.map do |row|
      rec = Code.new
      rec.code = row['Code'].strip
      rec.name = row['Name'].strip

      ## note: for now category and region are optional
      rec.category = row['Category'].strip   if row['Category'] && !row['Category'].empty?
      rec.region   = row['Region'].strip     if row['Region']   && !row['Region'].empty?
      rec
    end

    new( recs )
  end


  # codes - array of Code records (kept as is, not copied)
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  # Passes every Code record to the block.
  def each( &blk ) @codes.each( &blk ); end

  # Like Array#select but returns (again) a new Codes obj for easy-chaining.
  def select( &blk )
    Codes.new( @codes.select( &blk ) )
  end

  # Returns an array of the code strings (NOT of the records).
  def to_a
    @codes.map { |code| code.code }
  end

  ## def all() self.to_a; end    ## note: alias for to_a - use - why? why not??

  ## "pre-defined" convenience shortcuts
  def countries()       category 'Countries'; end
  def world()           category 'World'; end
  def oceans()          category 'Oceans'; end
  def misc()            category 'Miscellaneous'; end
  def others()          category 'Other'; end
  def dependencies()    category 'Dependencies'; end
  def dependencies_us() category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                      region 'Europe'; end
  def south_asia()                  region 'South Asia'; end
  def central_asia()                region 'Central Asia'; end
  def east_n_southeast_asia()       region 'East & Southeast Asia'; end
  def middle_east()                 region 'Middle East'; end
  def africa()                      region 'Africa'; end
  def north_america()               region 'North America'; end
  def central_america_n_caribbean() region 'Central America and Caribbean'; end
  def south_america()               region 'South America'; end
  def australia_oceania()           region 'Australia-Oceania'; end
  def antarctica()                  region 'Antarctica'; end

  ## note: keep the old (misspelled) shortcut names working
  alias_method :east_n_souteast_asia, :east_n_southeast_asia
  alias_method :antartica,            :antarctica

  ## note: regions oceans and world - same as category oceans and world
  ##   use oceans_ii or world_ii or something ??
  ##   use category('World') n region('World')
  ##   use category('Oceans') n region('Oceans')


  ##
  # Returns a new Codes obj with all records whose category contains
  # query (case-insensitive, plain text - NOT a regex).
  # Records without a category (nil) never match.
  def category( query )
    ## todo/future: allow passing in of regex too (not just string)
    filter_by( :category, query )
  end

  ##
  # Returns a new Codes obj with all records whose region contains
  # query (case-insensitive, plain text - NOT a regex).
  # Records without a region (nil) never match.
  def region( query )
    ## todo/future: allow passing in of regex too (not just string)
    filter_by( :region, query )
  end

  private

  # Shared filter for category and region.
  #   note: the query gets escaped e.g. Dependencies (France)
  #           becomes Dependencies \(France\) etc.
  def filter_by( attribute, query )
    filter_regex = /#{Regexp.escape(query)}/i
    codes = @codes.select do |code|
      value = code[ attribute ]
      value ? filter_regex.match( value ) : false   ## note: allow nil; will fail on search
    end
    Codes.new( codes )   ## return new Codes obj for easy-chaining
  end

end # class Codes
|
119
|
-
|
120
|
-
end # module Factbook
|
121
|
-
|
@@ -1,49 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
##
# List of factbook country comparisons (records with num, category and name).
class Comparisons

  Comparison = Struct.new( :num,       ### todo: use no or id or something - why? why not?
                           :category,  ## e.g. Geography, People, Economy, etc.
                           :name
                         )

  ##
  # Builds a Comparisons list from the rows returned by CsvHash.read( path ).
  #
  # Expects the columns Num, Category and Name; all values get leading
  # and trailing whitespace removed and Num gets converted with to_i.
  def self.read_csv( path )
    rows = CsvHash.read( path )

    ## note: no (debug) dumps of rows/records to stdout here;
    ##         reading a datafile must not print anything
    recs = rows.map do |row|
      rec = Comparison.new
      rec.num      = row['Num'].strip.to_i
      rec.category = row['Category'].strip
      rec.name     = row['Name'].strip
      rec
    end

    new( recs )
  end

  # comps - array of Comparison records (kept as is, not copied)
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  # Passes every Comparison record to the block.
  #   note: forwards the block (instead of a bare yield), thus,
  #           calling each without a block returns an enumerator
  #           (and no longer raises a LocalJumpError)
  def each( &blk ) @comps.each( &blk ); end

  # Returns an array of nums -- return something else - why? why not?
  def to_a
    @comps.map { |comp| comp.num }
  end

end # class Comparisons
|
47
|
-
|
48
|
-
end # module Factbook
|
49
|
-
|
@@ -1,42 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
module NormalizeHelper

  ##
  # Returns a cleaned-up copy of a category (field) name:
  #
  # 1. strips leading and trailing whitespace and removes all trailing colons
  #      (one or more, thus, fixes typos such as note:: too)
  # 2. replaces a handful of known typos, case and spelling variants
  # 3. cuts everything starting with "border countries" down to
  #      "border countries" e.g. removes the (x) counter in border countries (8)
  def normalize_category( text )
    name = text.strip.sub( /:+\z/, '' ).strip

    name = case name
           ## typos
           when 'ntoe'                           then 'note'
           when 'investment if fixed capital'    then 'investment in fixed capital'
           ## downcase
           when 'Lowest point'                   then 'lowest point'
           when 'Chief of state'                 then 'chief of state'
           ## spelling variant (use more popular one)
           when 'signed but not ratified'        then 'signed, but not ratified'
           when 'vectorborne disease'            then 'vectorborne diseases'
           when 'water contact diseases'         then 'water contact disease'
           when 'food or waterborne disease'     then 'food or waterborne diseases'
           when 'geographical coordinates'       then 'geographic coordinates'
           when 'notes'                          then 'note'
           when 'refugees (countries of origin)' then 'refugees (country of origin)'
           else name
           end

    ## border countries (8): -- remove (x) counter
    name.start_with?( 'border countries' ) ? 'border countries' : name
  end

end # module NormalizeHelper
|
42
|
-
end # module Factbook
|
@@ -1,50 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
|
5
|
-
##
# Reads pages from json files stored in a directory laid out as
#
#   <json_dir>/<region slug>/<code>.json
#
# e.g. <json_dir>/europe/au.json
class JsonPageReader
  # json_dir - root directory of the json files
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ##
  # Reads the json file for a single code record and returns a new Page.
  #
  # code - record responding to code, name and region (e.g. a Codes::Code)
  #
  # The file gets read as utf-8 text and passed on (unparsed) to Page.new.
  # Raises the usual file errors (e.g. Errno::ENOENT) if the file cannot be read.
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    ## note: use File.open for passing in a mode/encoding string;
    ##         File.read( path, 'r:utf-8' ) takes the 2nd arg as the length (in bytes)
    ##         and raises a TypeError
    json = File.open( path, 'r:utf-8' ) { |f| f.read }

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    #   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ##
  # Reads the pages for all code records and returns them in an array.
  #
  # codes - any object with an each method yielding code records
  # limit - for debugging: just process the first x entries (nil => no limit)
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: count the pages read so far; stop once the limit is reached
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

private
  ##
  # Converts a region name to the directory name (slug):
  #   change and  =>  n    (whole word only - keeps Iceland, Netherlands, etc.)
  #   change &    =>  n
  #   change all spaces to => -
  #
  # e.g. East & Southeast Asia          => east-n-southeast-asia
  #      Central America and Caribbean  => central-america-n-caribbean
  def region_to_slug( text )
    text.downcase.gsub( /\band\b/, 'n' ).gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
|
49
|
-
|
50
|
-
end # module Factbook
|
@@ -1,351 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
class Sanitizer
|
5
|
-
include LogUtils::Logging
|
6
|
-
include Utils ## e.g. find_page_info etc.
|
7
|
-
|
8
|
-
##
# Splits a full html page into the parts used for building a page.
#
# Returns an array with three items:
#   1) html profile without headers, footers, scripts, etc. (see find_country_profile)
#   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
#   3) errors -- for now always an empty array
#
# todo: add option for (html source) encoding - why?? why not??
def sanitize( html )
  page_info = PageInfo.new

  ## todo:
  ##   make page info optional? why? why not?
  ##   not always available (if page structure changes) - check
  ##   what page info is required??
  found = find_page_info( html )
  if found
    [:country_code,
     :country_name,
     :country_affiliation,
     :region_code,
     :region_name].each do |key|
      page_info.public_send( "#{key}=", found[key] )
    end
  else
    ## print/warn: no page info found
    ##   note: fallback - only the country code gets filled in
    page_info.country_code = find_country_code( html )
  end

  page_info.last_updated = find_page_last_updated( html )

  ## cut-off headers, footers, scripts, etc.
  html_profile = find_country_profile( html )

  ## todo/check: remove 3rd args old errors array - why? why not?
  [html_profile, page_info, []]
end
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
##
# Cuts the country profile out of a full html page and returns it
# as a simplified html string:
#
#   <h2>section title</h2>     -- one per section
#   <h3>field title</h3>       -- one per field (subsection)
#   <div ...>...</div>         -- sanitized data (see sanitize_data)
#
# Everything outside of the (first) <ul class="expandcollapse"> is dropped.
# The li children of the list are processed in pairs
# (1st li => section title, 2nd li => section content) and
# the div children of the content li are again processed in pairs
# (1st div => field title, 2nd div => field data).
#
# note: prints progress/warnings to stdout and ends the program (exit 1)
#         if a section title or a data div is not in the expected format
def find_country_profile( html )
  ##
  ## fix known broken html bugs
  ##   in co (Colombia) page (Nov/11 2020):
  ##     <div class="photogallery_captiontext">
  ##       <p>slightly less than twice the size of Texas</p
  ##     </div>
  ##   note: </p   => unclosed p!! change to </p>
  ##
  ## note: the regex uses a negative lookahead e.g. (?!pattern)
  html = html.gsub( %r{</p(?![>])} ) do |match|
    puts "!! WARN: fixing unclosed </p => </p>"
    puts "#{match}"
    '</p>'
  end

  doc = Nokogiri::HTML( html )
  ul  = doc.css( 'ul.expandcollapse' )[0]

  puts ul.to_html[0..100]

  ###
  ## sanitize
  ##   remove link items
  ##   assume two <li>s are a section
  out = String.new('')

  ## filter all li's  (skip text nodes, etc.)
  items = ul.children.select { |el| el.name == 'li' }

  puts " #{items.size} li(s):"
  items.each_slice(2) do |title_li, content_li|
    title_div = title_li.at( 'div[sectiontitle]' )
    if title_div.nil?
      puts "!! ERROR: no section title found in div:"
      puts title_li.to_html
      exit 1
    end

    section_title = title_div['sectiontitle'].to_s
    out << "<h2>#{section_title}</h2>\n"

    ## filter all div's
    fields = content_li.children.select { |el| el.name == 'div' }
    puts " #{fields.size} div(s) in >#{section_title}<:"

    ## note: the field title div holds the title either as plain text
    ##   (special case e.g. Geographic overview: in world) or wrapped in
    ##   a link inside <span class="btn-tooltip definition">
    ##   (regular case e.g. Area - comparative), followed by
    ##   <span class="tooltip-content"> and <span class="field-listing-link">;
    ##   thus, remove the two spans first and use the text that is left
    fields.each_slice(2) do |field_title_div, field_data_div|
      ['span.tooltip-content',
       'span.field-listing-link'].each do |selector|
        span = field_title_div.at( selector )
        next unless span

        span.inner_html = ''
        span.replace( '' )    ## check for how to delete/remove - why? why not!!
      end

      subsection_title = field_title_div.text.strip
      out << "\n<h3>#{subsection_title}</h3>\n"

      data_divs = field_data_div.children.select { |el| el.name == 'div' }
      puts " #{data_divs.size} div(s) in field >#{subsection_title}<:"

      data_divs.each do |catdiv|
        css_class = catdiv['class']

        if css_class && css_class.index( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.index( 'attachment' )

          out << sanitize_data( catdiv, title: subsection_title ) << "\n"
        elsif catdiv.to_html.index( 'country comparison to the world' )
          ## simplify/unlinkify country comparison
          ##   <div>
          ##     <span class='category'>country comparison to the world:</span>
          ##     <span class='category_data'>
          ##       <a href="../fields/335rank.html#AU">97</a>
          ##     </span>
          ##   </div>
          ##  e.g. to =>
          ##   <div>
          ##     country comparison to the world: 97
          ##   </div>
          out << "<div>\n #{squish( catdiv.text.strip )}\n</div>" << "\n"
        else
          puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
          puts catdiv.to_html
          exit 1
        end
      end
    end
  end

  out
end
|
231
|
-
|
232
|
-
|
233
|
-
#
|
234
|
-
# <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
|
235
|
-
#
|
236
|
-
# remove aria labels
|
237
|
-
## matches an aria-label attribute (incl. leading whitespace) e.g. in
##   <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
ARIA_ATTR_RE = /\s*
aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
/xim ## do NOT allow multi-line - why? why not?

## find double breaks e.g. <br><br>
BR_BR_RE = /(<br> \s* <br>)
/xim ## do NOT allow multi-line - why? why not?


##
# Simplifies the markup inside a data element and returns the element
# (incl. its container tag) as an html string.
#
# el    - node responding to css, children, inner_html= and to_html
#           note: gets changed in place (links replaced, inner html replaced)
# title - field title; only used in the messages printed to stdout
#
# Steps:
#   0) replace all a(nchor) links with just the inner text
#   1) unwrap paragraphs; non-empty paragraphs get joined with " ++ "
#   2) replace non-breaking spaces with spaces
#   3) remove aria-label attributes
#   4) squish double <br>s into one and replace <br> with " ++ "
#   5) remove a " ++" right after a subfield marker e.g. of: ++  =>  of:
#   6) replace smart quotes with ascii quotes
#
# todo/fix/check:
#   check if more than one p(aragraph)
#   get squeezed together without space inbetween?
def sanitize_data( el, title: )
  ## step 0: replace all possible a(nchor) links with just inner text
  el.css( 'a' ).each do |a|
    a.replace( " #{a.text.strip} " )
  end

  ## step 1 - unwrap paragraphs if present
  ##     and convert dom/nokogiri doc/tree to html string
  inner_html = String.new('')
  p_count = 0
  el.children.each do |child|
    if child.name == 'p'
      p_inner_html = child.inner_html.strip   ## note: unwrap! use inner_html NOT to_html/html
      next if p_inner_html.empty?             ## note: skip empty paragraphs for now

      inner_html << ' ++ ' if p_count > 0
      inner_html << p_inner_html
      inner_html << " \n\n "

      p_count += 1
    else
      inner_html << child.to_html
    end
  end

  ## note: replace all non-breaking spaces with spaces for now
  ##   see fr (france) in political parties section for example
  ##   handle both forms - the html entity (&nbsp;) and
  ##     the unicode char 'NO-BREAK SPACE' (U+00A0)
  inner_html = inner_html.gsub( /&nbsp;|\u00A0/, ' ' )

  ## note: keep container div!! just replace inner html!!!
  ## note: right strip all trailing spaces/newlines for now
  ##         plus add back a single one for pretty printing
  el.inner_html = inner_html.rstrip + "\n"

  # finally - convert back to html (string)
  html = el.to_html

  ## remove aria labels -- do not report / keep silent for now
  html = html.gsub( ARIA_ATTR_RE, '' )

  html = html.gsub( BR_BR_RE ) do |m|
    puts "in >#{title}< squish two <br>s into one:"
    puts "#{m}"
    '<br>'
  end

  html = html.gsub( /<br>/i ) do |m|
    puts "in >#{title}< replace <br> with inline (plain) text ++:"
    puts "#{m}"
    ' ++ '
  end

  ## cleanup/remove ++ before subfield e.g.
  ##   of: ++ => of:   or such
  ##
  ## todo/fix: add negative lookahead e.g. not another + to be more specific!!
  html = html.gsub( %r{
                (?<=([a-z]:)|(:</span>)) # note: use zero-length positive lookbehind
                \s+
                \+{2}}xim ) do |m|
    puts "in >#{title} remove ++ before <field>: marker:"
    puts "#{m}"
    ' '
  end

  #####
  # "unfancy" smart quotes to ascii - why? why not?
  # e.g.
  #   Following Britain’s victory  => Following Britain's victory
  html = html.tr( "’", "'" )
  #  “full floor” House vote       => "full floor" House vote
  html = html.tr( "“”", '""' )

  html
end
|
343
|
-
|
344
|
-
##
# Returns a copy of str with every run of two or more whitespace chars
# (space, tab, newline, carriage return) replaced by a single space.
#   note: a single whitespace char (e.g. a lone newline) is kept as is
def squish( str )
  str.gsub( /[ \t\n\r]{2,}/ ) { ' ' }
end
|
347
|
-
|
348
|
-
|
349
|
-
end # class Sanitizer
|
350
|
-
|
351
|
-
end # module Factbook
|