factbook 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,148 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- ## note:
6
- ## some factbook pages with chrome (headers, footers, etc.)
7
- ## are NOT valid utf-8, thus,
8
- ## treat page as is (e.g. ASCII8BIT)
9
- #
10
- # only convert to utf8 when header and footer got stripped
11
-
12
- ##
13
- ## be/benin:
14
- ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
15
- #
16
- ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
17
- # Lazare Sèhouéto
18
- #
19
- # looks good - use (assume) Windows-1252 ????
20
-
21
- ##
22
- # check if the page is plain 7-bit ascii ??? if yes - no worries
23
- # if not, log number of chars not using ascii 7-bit
24
-
25
-
26
-
27
##
## One factbook country profile page.
##
##  The page gets built either from a json string (opts[:json]),
##  from a passed-in html string (opts[:html]) or - if neither is given -
##  from the html page read via Webcache for the country code.
class Page
  include LogUtils::Logging

  ## standard version (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  ## sects - "structured" access e.g. sects/subsects/etc.
  ## info  - meta info e.g. country_code, country_name, region_name, last_updated, etc.
  ## data  - "plain" access with vanilla hash
  attr_reader :sects, :info, :data

  ## code - country code; only used to build the page url
  ##          if no json or html gets passed in
  ## opts - :json  page as json text (NOT yet parsed)
  ##        :html  page as html text (note: expects ASCII-7BIT/BINARY encoding)
  ##        :info  page info; overwrites the page info found by the builder
  def initialize( code, opts={} )
    builder = if opts[:json]
                JsonBuilder.from_string( opts[:json] )
              else
                ## note: a "custom" passed-in html page (for debugging and testing)
                ##          wins over reading the page from the web (cache)
                page_html = opts[:html] || Webcache.read( SITE_BASE.gsub( '{code}', code ))
                Builder.from_string( page_html )
              end

    @sects = builder.sects
    @info  = builder.info

    ## todo/fix/quick hack: info opts hash entry lets you overwrite page info
    @info  = opts[:info]   if opts[:info]

    @data = @sects.each_with_object( {} ) do |sect, hash|
      hash[ sect.title ] = sect.data
    end

    self   ## return self (kept as is - check - not needed??)
  end


  ## convenience helper for data.to_json; note: pretty print by default!
  ##   pass in minify: true for the compact version
  def to_json( opts={} )
    opts[:minify] ? data.to_json : JSON.pretty_generate( data )
  end


  ## convenience shortcut - lets you use page['geo'] instead of page.data['geo']
  def [](key)
    data[key]
  end


  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##  e.g.
  ##    def background()  data['Introduction']['Background']['text']; end
  ##    def location()    data['Geography']['Location']['text'];      end
  ##  note: missing categories or (sub)fields result in nil (never raise)
  ATTRIBUTES.each do |attrib|
    ## a one-element path walks one level down; anything else
    ##   gets treated as a two-element path (assume size 2 for now)
    depth = attrib.path.size == 1 ? 1 : 2

    define_method( attrib.name.to_sym ) do
      node = @data.fetch( attrib.category, {} )
      depth.times { |level| node = node.fetch( attrib.path[level], {} ) }
      node['text']
    end
  end


  private

  ## (currently unused) read the page via Webget instead of Webcache
  def fetch_page( url )
    response = Webget.page( url )

    ## note: exit on get / fetch error - do NOT continue for now - why? why not?
    exit 1   if response.status.nok?    ## e.g. HTTP status code != 200

    response.text
  end

  ## old (commented out) constructors - kept for reference:
  ##
  ##   def self.from_url( cc, cn )
  ##     html_ascii = PageFetcher.new.fetch( cc )
  ##     self.new( cc, cn, html_ascii )
  ##   end
  ##
  ##   def self.from_file( cc, cn, opts={} )
  ##     input_dir = opts[:input_dir] || '.'
  ##     html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
  ##     self.new( cc, cn, html_ascii )
  ##   end
end # class Page
148
- end # module Factbook
@@ -1,12 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
## Page meta info record; every field is optional and nil until set.
PageInfo = Struct.new(
  :country_code, :country_name, :country_affiliation,
  :region_code,  :region_name,
  :last_updated
)
11
-
12
- end # module Factbook
@@ -1,51 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
##
## Reads factbook pages from json files stored on disk as
##    <json_dir>/<region slug>/<country code>.json
class JsonPageReader
  ## json_dir - root directory holding one sub directory per region
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ## Read the page for a single code record.
  ##   code - record responding to code, name and region
  ## returns a Page built from the json text
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    ##   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ## Read the pages for all codes (in order).
  ##   limit - for debugging just process the first x entries; nil reads all
  ## returns an array of pages
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: stop as soon as the limit is reached; the number of pages read
      ##   so far is the counter (the old separate counter never got incremented)
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

  private

  ## change and => n
  ## change &   => n
  ## change all spaces to => -
  ##  e.g. East & Southeast Asia         => east-n-southeast-asia
  ##       Central America and Caribbean => central-america-n-caribbean
  ## note: replaces every "and" - inside words too
  def region_to_slug( text )
    text.downcase.gsub( 'and', 'n' ).gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
50
-
51
- end # module Factbook
@@ -1,178 +0,0 @@
1
-
2
- module Factbook
3
-
4
##
## Cuts a raw factbook country page down to the country profile
##   (sections, subsections and category data only).
class Sanitizer
  include LogUtils::Logging
  include Utils    ## pulls in encode_utf8, ...


  ## html_ascii - the complete page incl. headers, footers, etc.
  ##
  ## returns an array with three entries
  ##    1) html profile without headers, footers, scripts, etc.
  ##    2) page (meta) info e.g. country_name, country_code, last_updated, etc.
  ##    3) errors - always an empty list for now (the utf-8 encoding step is turned off)
  ##
  ## note: find_page_info, find_country_code and find_page_last_updated
  ##   are not defined here - presumably pulled in via Utils (verify)
  def sanitize( html_ascii )
    ## todo: add option for (html source) encoding - why?? why not??

    page_info = PageInfo.new

    ## todo:
    ##   make page info optional? why? why not?
    ##   not always available (if page structure changes) - check
    ##   what page info is required??
    h = find_page_info( html_ascii )
    if h
      page_info.country_code        = h[:country_code]
      page_info.country_name        = h[:country_name]
      page_info.country_affiliation = h[:country_affiliation]
      page_info.region_code         = h[:region_code]
      page_info.region_name         = h[:region_name]
    else
      ## no page info found - fall back to the country code only
      page_info.country_code = find_country_code( html_ascii )
    end

    page_info.last_updated = find_page_last_updated( html_ascii )

    html = find_country_profile( html_ascii )    ## cut-off headers, footers, scripts, etc.

    ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
    # html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8  (from binary/ascii8bit)

    # html = sanitize_profile( html )

    [html, page_info, []]
  end


  #
  # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
  #
  # remove aria labels
  ARIA_ATTR_REGEX = /\s*
                     aria-label=('|").+?\1     ## note: use non-greedy match e.g. .+?
                    /xim    ## do NOT allow multi-line - why? why not?


  ## Build the stripped down profile from the <ul class="expandcollapse"> list.
  ##
  ##   every two <li>s are a section (title + body) and
  ##   every two <div>s in the body are a subsection (title + category data)
  ##
  ## returns the profile as html with <h2> section and <h3> subsection headings;
  ##   returns an empty string if the list is missing or the page uses the old format
  def find_country_profile( html )
    doc = Nokogiri::HTML( html )

    ul = doc.css( 'ul.expandcollapse' )[0]

    ## note: page structure changed (or not a country page)?
    ##   warn and return an empty profile (same as for old format pages)
    ##   instead of crashing on nil
    if ul.nil?
      puts "  !! WARN: no ul.expandcollapse found - returning empty profile"
      return ''
    end

    puts ul.to_html[0..100]

    ## note: special case cc (CURACAO) uses h2 instead of div block
    ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc" ...>
    ##  is old format !!!!
    ##    wait for new version to be generated / pushed!!!
    ##
    ## check for old format if h2 are present
    h2s = ul.css( 'h2' )
    if h2s.size > 0
      puts "  !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
      ## return empty html string - why? why not?
      return ''
    end


    ###
    ## sanitize

    html = String.new('')

    ## filter all li's (skips text nodes, etc.)
    ul_children = ul.children.select { |el| el.name == 'li' }
    puts "  #{ul_children.size} li(s):"

    ul_children.each_slice(2) do |lis|
      title_li, body_li = lis

      div = title_li.at( 'div[sectiontitle]' )
      if div.nil?
        ## NOTE(review): terminates the whole process from inside the library -
        ##   kept as is because callers might depend on it
        puts "!! ERROR: no section title found in div:"
        puts title_li.to_html
        exit 1
      end

      section_title = div['sectiontitle'].to_s

      html << "<h2>#{section_title}</h2>\n"

      ## note: odd number of li's - the last title has no body
      if body_li.nil?
        puts "!! WARN: no section body found for >#{section_title}< - skipping"
        next
      end

      ## filter all div's
      li_children = body_li.children.select { |el| el.name == 'div' }
      puts "    #{li_children.size} div(s):"

      li_children.each_slice(2) do |divs|
        head_div, data_div = divs

        a = head_div.css('a')[0]
        if a
          html << "\n<h3>#{a.text}:</h3>\n"
        else
          puts "!! WARN: no anchor found:"
          puts head_div.to_html
        end

        ## note: odd number of div's - the last heading has no data
        if data_div.nil?
          puts "!! WARN: no data div found following:"
          puts head_div.to_html
          next
        end

        data_div.children.select { |el| el.name == 'div' }.each do |catdiv|
          klass = catdiv['class']
          if klass && klass.include?( 'category_data' )
            ## skip attachments e.g. maps, pop pyramids, etc.
            next if klass.include?( 'attachment' )

            html << catdiv.to_html
            html << "\n"
          else
            puts "!! WARN: skipping div (W/O category_data class):"
            puts catdiv.to_html
          end
        end
      end
    end


    html = html.gsub( ARIA_ATTR_REGEX ) do |m|
      puts "remove aria-label attr:"
      puts "#{m}"
      ''
    end

    html
  end

end # class Sanitizer
177
-
178
- end # module Factbook
@@ -1,29 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
##
## A page section; holds a title plus the list of subsections.
class Sect
  include LogUtils::Logging

  ## note: use name instead of title - why? why not?
  attr_accessor :title, :subsects

  def initialize
    @subsects = []
  end

  ## convert subsects to hash; key is the subsect title, value the subsect data
  ##   note: gets rebuilt on every call
  def data
    @data = subsects.each_with_object( {} ) do |subsect, hash|
      hash[ subsect.title ] = subsect.data
    end
  end

end # class Sect
28
-
29
- end # module Factbook
@@ -1,18 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
##
## A page subsection; holds a title plus the data hash.
class Subsect
  include LogUtils::Logging

  ## title - use name instead of title - why? why not?
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## starts out with an empty data hash (and no title)
  def initialize
    @data = {}
  end

end # class Subsect
17
-
18
- end # module Factbook
@@ -1,52 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- ##
6
- ## make more "generic" - why? why not?
7
- ## (re)use for other files ?? move to textutils ??
8
-
9
- ##
10
- ## for now reads in rows with values separated by at least 3+ spaces e.g.:
11
- ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
- ## 1 China 1,367,485,388
13
- ## 2 India 1,251,695,584
14
- ## 3 European Union 513,949,445
15
- ## 4 United States 321,368,864
16
- ## 5 Indonesia 255,993,674
17
- ## 6 Brazil 204,259,812
18
-
19
-
20
##
## Reads a plain text table, one record per line, with the values
##   separated by three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - the complete table as a string
  def initialize( text )
    @text = text
  end

  ## returns an array of records; every record is an array of (string) values
  ##   note: empty lines get skipped (and reported with the line number)
  def read
    rows = []

    @text.each_line.with_index( 1 ) do |raw, line_no|
      line = raw.strip      ## remove leading and trailing whitespace

      if line.empty?
        puts "** skipping empty line #{line_no}"
        next
      end

      ## split three or more spaces - use just two ?? why? why not??
      rows << line.split( /[ ]{3,}/ )
    end

    rows
  end

end # class TableReader
51
-
52
- end # module Factbook