factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
data/lib/factbook/sect.rb CHANGED
@@ -1,196 +1,29 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Sect # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
6
- include LogUtils::Logging
7
-
8
- attr_reader :title, :html
9
-
10
- def initialize( title, html, opts={} )
11
- ## todo: passing a ref to the parent page - why? why not??
12
- @title = title
13
- @html = html
14
- @opts = opts # fields: full|long|keep|std|??? -- find a good name for the option keeping field names as is
15
-
16
- @doc = nil
17
- @data = nil
18
- end
19
-
20
- def doc
21
- ### check: use nokogiri html fragment? why? why not??
22
- @doc ||= Nokogiri::HTML( @html )
23
- end
24
-
25
- def data
26
- @data ||= sect_to_hash( doc )
27
- end
28
-
29
- private
30
-
31
- def cleanup_key( key )
32
-
33
- if @opts[:fields] # if set assume full|long|keep for now
34
- ### kepe field names as is
35
- ## e.g.
36
- ## GDP - composition, by sector of origin:
37
- ## Budget surplus (+) or deficit (-):
38
- ## becomes:
39
- ## GDP - composition, by sector of origin
40
- ## Budget surplus (+) or deficit (-)
41
- key = key.strip
42
- key = key.gsub( /[ ]{2,}/, ' ' ) # fold two plus spaces into one -- check if exists?
43
- key = key.gsub( /:\z/, '' ) # remove trailing : if present
44
- key = key.strip
45
- else
46
- ## to lower case
47
- key = key.downcase
48
- ## seaport(s) => seaports
49
- key = key.gsub( '(s)', 's' )
50
- key = key.gsub( ':', '' ) # trailing : ## fix: use regex /:$/ w/ anchor??
51
- ## remove special chars ()+-/,'
52
- key = key.gsub( /['()+\-\/,]/, ' ' )
53
- key = key.strip
54
- key = key.gsub( /[ ]+/, '_' )
55
- end
56
-
57
- key
58
- end
59
-
60
-
61
- def sect_to_hash( sect )
62
-
63
- rows = sect.css( 'table tr' )
64
- cells = sect.css( 'table tr td' )
65
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
66
- data_ids = rows.css( '#data' )
67
-
68
- logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
69
-
70
- hash = {}
71
- last_cat = nil
72
-
73
- cells.each_with_index do |cell,i|
74
- ## next if i > 14 ## skip after xx for debugging for now
75
-
76
- # check if field or data id
77
- # check for (nested) div#field in td
78
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
79
-
80
- # check for td#data
81
- has_data_id = cell['id'] == 'data' ? true : false
82
-
83
- if has_field_id
84
-
85
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
86
- if cats.size == 1
87
- text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
88
- last_cat = text
89
- logger.debug " [#{i}] category: >>#{text}<<"
90
- else
91
- logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
92
- logger.warn cell.to_s
93
- end
94
-
95
- elsif has_data_id
96
-
97
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
98
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
99
- cats_div_data = cell.css( 'div.category_data' )
100
- cats_span_data = cell.css( 'span.category_data' )
101
-
102
- logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
103
-
104
- pairs = []
105
- last_pair = nil
106
- last_pair_data_count = 0
107
-
108
- ## loop over div blocks (might be .category or .category_data)
109
- cell.children.each_with_index do |child,j|
110
- unless child.element?
111
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
112
- ## puts child.to_s
113
- next
114
- end
115
- unless child.name == 'div'
116
- logger.warn " **** !!! skipping non-div >#{child.name}<:"
117
- logger.warn child.to_s
118
- next
119
- end
120
-
121
- ### check if .category or .category_data
122
- if child['class'] == 'category'
123
-
124
- ## collect text for category; exclude element w/ class.category_data
125
- text = ""
126
- child.children.each do |subchild|
127
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
128
- end
129
- text = cleanup_key( text )
130
-
131
- value = child.css('span.category_data').text.strip
132
-
133
- logger.debug " -- category >>#{text}<<"
134
-
135
- ## start new pair
136
- last_pair = [ text, value ]
137
- last_pair_data_count = 0
138
- pairs << last_pair
139
-
140
- elsif child['class'] == 'category_data'
141
- logger.debug " -- category_data"
142
-
143
- text = child.text.strip
144
-
145
- if last_pair.nil?
146
- ## assume its the very first entry; use implied/auto-created category
147
- last_pair = [ 'text', '' ]
148
- last_pair_data_count = 0
149
- pairs << last_pair
150
- end
151
-
152
- ### first category_data element?
153
- if last_pair_data_count == 0
154
- if last_pair[1] == ''
155
- last_pair[1] = text
156
- else
157
- last_pair[1] += " #{text}" ## append w/o separator
158
- end
159
- else
160
- if last_cat == 'demographic_profile' || last_cat == 'Demographic profile' ## special case (use space a sep)
161
- last_pair[1] += " #{text}" ## append with separator
162
- else
163
- last_pair[1] += "; #{text}" ## append with separator
164
- end
165
- end
166
- last_pair_data_count += 1
167
-
168
- else
169
- logger.warn " **** !!! skipping div w/o category or category_data class:"
170
- logger.warn child.to_s
171
- end
172
- end
173
-
174
- ## pp pairs
175
-
176
- ## pairs to hash
177
- pairs_hash = {}
178
- pairs.each do |pair|
179
- pairs_hash[ pair[0] ] = pair[1]
180
- end
181
-
182
- hash[ last_cat ] = pairs_hash
183
-
184
- else
185
- logger.warn "#### !!!! unknown cell type (no field or data id found):"
186
- logger.warn cell.to_s
187
- end
188
- end # each cell
189
-
190
- hash # return hash
191
-
192
- end # method sect_to_hash
193
-
194
- end # class Sect
195
-
196
- end # module Factbook
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
## A top-level section of a factbook page; owns an ordered list of subsections.
class Sect
  include LogUtils::Logging

  ## title    - section heading
  ## subsects - ordered list of subsection objects (each responds to title and data)
  attr_accessor :title, :subsects   ## use name instead of title - why? why not?

  def initialize
    @subsects = []
  end

  ## Rebuilds (on every call) and returns a hash that maps each
  ## subsection title to that subsection's data.
  ## note: when two subsections share a title the later one wins.
  def data
    @data = subsects.each_with_object( {} ) do |subsect, hash|
      hash[ subsect.title ] = subsect.data
    end
  end

end # class Sect
28
+
29
+ end # module Factbook
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
## A titled subsection; a plain value holder for its data hash.
class Subsect
  include LogUtils::Logging

  ## title - subsection heading   (use name instead of title - why? why not?)
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## Starts out with an empty data hash and no title.
  def initialize
    @data = {}
  end

end # class Subsect
17
+
18
+ end # module Factbook
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ ##
6
+ ## make more "generic" - why? why not?
7
+ ## (re)use for other files ?? move to textutils ??
8
+
9
+ ##
10
+ ## for now reads in rows with values separated by at least 3+ spaces e.g.:
11
+ ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
+ ## 1 China 1,367,485,388
13
+ ## 2 India 1,251,695,584
14
+ ## 3 European Union 513,949,445
15
+ ## 4 United States 321,368,864
16
+ ## 5 Indonesia 255,993,674
17
+ ## 6 Brazil 204,259,812
18
+
19
+
20
## Reads a plain-text table whose columns are separated by
## runs of three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - the complete table text (anything that responds to each_line)
  def initialize( text )
    @text = text
  end

  ## Returns an array of records; every record is the array of column
  ## values of one non-blank line. Blank lines are reported on stdout
  ## (with their 1-based line number) and skipped.
  def read
    @text.each_line.with_index( 1 ).each_with_object( [] ) do |(raw, line_no), recs|
      row = raw.strip     ## remove leading and trailing whitespace
      if row.empty?
        puts "** skipping empty line #{line_no}"
      else
        ## split three or more spaces - use just two ?? why? why not??
        recs << row.split( /[ ]{3,}/ )
      end
    end
  end

end # class TableReader
51
+
52
+ end # module Factbook
@@ -0,0 +1,85 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
module Utils

  ########################################
  ## todo: move to textutils - why, why not ?????

  ## Returns [text, errors]: text re-tagged as UTF-8 with every invalid
  ## byte sequence replaced by U+FFFD, plus one timestamped message per
  ## replacement marker found.
  ##
  ## note: factbook claims utf-8 - but includes invalid bytes in some pages
  ##   (encoding is likely western/windows-125x)
  ##
  ## Only bytes that are NOT valid UTF-8 get replaced; valid multi-byte
  ## characters (e.g. umlauts) are kept. Transcoding from 'binary' would
  ## replace every byte above 0x7F, valid or not.
  ## The argument itself is never modified.
  def encode_utf8( text )
    marker = "\u{FFFD}"     ## unknown/invalid unicode char
    errors = []             ## also return list of encoding errors

    text = text.dup.force_encoding( Encoding::UTF_8 ).scrub( marker )

    ## check for replaced/invalid chars and log warning
    pos = text.index( marker )
    while pos
      ## clamp lower bound - a negative start would index from the END of text
      from   = [pos - 10, 0].max
      to     = pos + 10          ## running past the end is fine (range gets cut off)
      around = text[from..to]
      puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
      msg = "invalid char on pos #{pos} around: >#{around}<"
      puts msg

      ## also record message w/ timestamp
      errors << "#{Time.now} - #{msg}"

      pos = text.index( marker, pos + 1 )
    end

    [text, errors]   ## return text and errors (list)
  end


  ## Formats one record (list of values) as a single csv line (no newline).
  ##
  ##  - numbers w/ thousands separators lose their commas,
  ##    e.g. 17,098,242 => 17098242, $17,098,242 => $17098242, -44,000 => -44000
  ##  - values holding a comma, double quote or line break get wrapped in
  ##    double quotes (embedded double quotes are doubled),
  ##    e.g. Guam, The => "Guam, The"
  ##  - everything else gets added as is 1:1
  ##
  ## nil (or any non-string) values are converted with to_s first.
  def values_to_csv( values )
    ## note: \A and \z anchor the WHOLE value (^ and $ would match per line)
    number = /\A-?\$?[1-9][,0-9]+[0-9]\z/

    fields = values.map do |value|
      value = value.to_s
      if value =~ number
        value.delete( ',' )
      elsif value =~ /[,"\r\n]/
        '"' + value.gsub( '"', '""' ) + '"'
      else
        value
      end
    end

    fields.join( ',' )
  end


  ## Formats a header line followed by one line per record;
  ## every line (including the last) ends with a newline.
  def data_to_csv( recs, headers )
    lines = [ values_to_csv( headers ) ]
    recs.each { |rec| lines << values_to_csv( rec ) }

    lines.join( "\n" ) + "\n"
  end

end # module Utils
85
+ end # module Factbook
@@ -0,0 +1,102 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
module Utils

  #######
  ## find meta data (about page info)


  #### e.g. Page last updated on September 16, 2015

  ## maps english month names (capitalized) to month numbers (as strings)
  MONTH_EN_TO_S={
    'January'   => '1',
    'February'  => '2',
    'March'     => '3',
    'April'     => '4',
    'May'       => '5',
    'June'      => '6',
    'July'      => '7',
    'August'    => '8',
    'September' => '9',
    'October'   => '10',
    'November'  => '11',
    'December'  => '12'
  }

  ## note: matches case-insensitive (i), across lines (m), extended syntax (x)
  PAGE_LAST_UPDATED_REGEX = /
     Page \s last \s updated \s on \s
     (?<month_en>[a-z]+) \s
     (?<day>\d{1,2}), \s
     (?<year>\d{4})
     /imx

  ## Looks for the "Page last updated on <month> <day>, <year>" stamp in html.
  ##
  ## Returns a Date or nil if there is no stamp or the month name is unknown.
  ## Raises ArgumentError (from Date.strptime) for impossible dates.
  def find_page_last_updated( html )
    m = PAGE_LAST_UPDATED_REGEX.match( html )
    return nil unless m

    pp m
    month_en = m[:month_en]
    day      = m[:day]
    year     = m[:year]
    puts "** bingo - month #{month_en}, day #{day}, year #{year}"

    ## the regex ignores case - normalize (e.g. SEPTEMBER => September)
    ##   before the case-sensitive hash lookup
    month = MONTH_EN_TO_S[ month_en.capitalize ]
    return nil if month.nil?      ## not a month name - treat as not found

    date_str = "#{year}-#{month}-#{day}"
    pp date_str
    Date.strptime( date_str, '%Y-%m-%d' )
  end

  ##
  ## e.g. regioncode="eur"
  ##      countrycode="au"
  ##      countryname="Austria"
  ##      flagsubfield=""
  ##      countryaffiliation=""
  ##      flagdescription=""
  ##      flagdescriptionnote=""
  ##      region="Europe"
  ##
  ## note: countryaffiliation may be empty
  ## note: every value may use double or single quotes; the closing quote
  ##         must match the opening one (backref to q1..q5)

  PAGE_INFO_REGEX = /
     regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
       \s+
     countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>
       \s+
     countryname=(?<q3>"|')(?<country>.+?)\k<q3>
       \s+
     [^>]+?                 ## allow any attribs (note: non-greedy)
     countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>   ## note: might be empty
       \s+
     [^>]+?                 ## allow any attribs (note: non-greedy)
     region=(?<q5>"|')(?<region>.+?)\k<q5>    ## check world - might be empty ?? or for ocean ??
     /imx


  ## Looks for the country and region attributes in html.
  ##
  ## Returns a hash with the keys country_code, country_name,
  ## country_affiliation, region_code and region_name or nil if not found.
  def find_page_info( html )
    m = PAGE_INFO_REGEX.match( html )
    return nil unless m    ## or return empty struct with nils/empty strings - why?? why not??

    pp m

    h = { country_code:        m[:country_code],
          country_name:        m[:country],
          country_affiliation: m[:affiliation],
          region_code:         m[:region_code],
          region_name:         m[:region] }

    puts "** bingo - #{h.inspect}"
    h    ## return hash w/ name-value pairs
  end

end # module Utils
102
+ end # module Factbook
@@ -1,9 +1,10 @@
1
+ # encoding: utf-8
1
2
 
2
3
  module Factbook
3
4
 
4
- MAJOR = 0
5
- MINOR = 1
6
- PATCH = 3
5
+ MAJOR = 1
6
+ MINOR = 0
7
+ PATCH = 0
7
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
9
 
9
10
  def self.version
data/lib/factbook.rb CHANGED
@@ -7,6 +7,7 @@ require 'uri'
7
7
  require 'cgi'
8
8
  require 'pp'
9
9
  require 'json'
10
+ require 'csv'
10
11
  require 'fileutils'
11
12
 
12
13
 
@@ -21,11 +22,32 @@ require 'nokogiri'
21
22
  # our own code
22
23
 
23
24
  require 'factbook/version' # let it always go first
25
+ require 'factbook/utils'
26
+ require 'factbook/utils_info'
27
+ require 'factbook/sanitizer'
28
+ require 'factbook/builder_item'
29
+ require 'factbook/builder'
24
30
  require 'factbook/page'
25
31
  require 'factbook/sect'
32
+ require 'factbook/subsect'
26
33
 
34
+ require 'factbook/codes'
35
+ require 'factbook/comparisons'
27
36
 
37
+ require 'factbook/table' ## e.g. TableReader
28
38
 
29
39
 
30
- puts Factbook.banner
31
40
 
41
module Factbook

  ## auto-load builtin codes and comparisons (read once at require time)
  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv")

  class << self
    ## returns the builtin codes loaded from data/codes.csv
    def codes
      CODES
    end

    ## returns the builtin comparisons loaded from data/comparisons.csv
    def comparisons
      COMPARISONS
    end
  end

end # module Factbook
51
+
52
+
53
+ puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG