factbook 1.2.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,75 +1,68 @@
1
- # encoding: utf-8
2
-
3
- ## stdlibs
4
-
5
- require 'net/http'
6
- require 'net/https' ## note: cia factbook requires https
7
- require 'uri'
8
- require 'cgi'
9
- require 'pp'
10
- require 'json'
11
- require 'csv'
12
- require 'fileutils'
13
- require 'erb' ## used by Almanac class (for render)
14
-
15
-
16
- ## 3rd party gems/libs
17
- ## require 'props'
18
-
19
- require 'logutils'
20
- require 'fetcher'
21
- require 'nokogiri'
22
-
23
- require 'active_record' ## add activerecord/db support (NOT optional for now)
24
-
25
-
26
- # our own code
27
-
28
- require 'factbook/version' # let it always go first
29
-
30
-
31
- require 'factbook/codes'
32
- require 'factbook/comparisons'
33
- require 'factbook/attributes'
34
-
35
- module Factbook
36
-
37
- ## auto-load builtin codes, comparisons, attributes, etc.
38
- CODES = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
39
- COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
40
- ATTRIBUTES = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
41
-
42
- def self.codes() CODES; end
43
- def self.comparisons() COMPARISONS; end
44
- def self.attributes() ATTRIBUTES; end
45
-
46
- end # module Factbook
47
-
48
- ## note: make codes, comparisons, attributes available
49
-
50
- require 'factbook/utils'
51
- require 'factbook/utils_info'
52
- require 'factbook/sanitizer'
53
- require 'factbook/normalize'
54
- require 'factbook/builder_item'
55
- require 'factbook/builder'
56
- require 'factbook/builder_json'
57
- require 'factbook/page'
58
- require 'factbook/page_info'
59
- require 'factbook/sect'
60
- require 'factbook/subsect'
61
-
62
- require 'factbook/reader_json'
63
- require 'factbook/almanac'
64
-
65
- require 'factbook/table' ## e.g. TableReader
66
-
67
- require 'factbook/counter'
68
-
69
- require 'factbook/db/schema' ## database (sql tables) support
70
- require 'factbook/db/models'
71
- require 'factbook/db/importer'
72
-
73
-
74
-
75
- puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
1
+ ## stdlibs
2
+
3
+
4
+ require 'cgi'
5
+ require 'csv' ## fix: use csvreader!!!!
6
+ require 'erb' ## used by Almanac class (for render)
7
+
8
+
9
+ ## 3rd party gems/libs
10
+ ## require 'props'
11
+
12
+ require 'logutils'
13
+ require 'webget'
14
+ require 'nokogiri'
15
+
16
+ require 'active_record' ## add activerecord/db support (NOT optional for now)
17
+
18
+
19
+
20
+ # our own code
21
+ require 'factbook/version' # let it always go first
22
+
23
+
24
+ require 'factbook/codes'
25
+ require 'factbook/comparisons'
26
+ require 'factbook/attributes'
27
+
28
+ module Factbook
29
+
30
+ ## auto-load builtin codes, comparisons, attributes, etc.
31
+ CODES = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
32
+ COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
33
+ ATTRIBUTES = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
34
+
35
+ def self.codes() CODES; end
36
+ def self.comparisons() COMPARISONS; end
37
+ def self.attributes() ATTRIBUTES; end
38
+
39
+ end # module Factbook
40
+
41
+ ## note: make codes, comparisons, attributes available
42
+
43
+ require 'factbook/utils'
44
+ require 'factbook/utils_info'
45
+ require 'factbook/sanitizer'
46
+ require 'factbook/normalize'
47
+ require 'factbook/builder_item'
48
+ require 'factbook/builder'
49
+ require 'factbook/builder_json'
50
+ require 'factbook/page'
51
+ require 'factbook/page_info'
52
+ require 'factbook/sect'
53
+ require 'factbook/subsect'
54
+
55
+ require 'factbook/reader_json'
56
+ require 'factbook/almanac'
57
+
58
+ require 'factbook/table' ## e.g. TableReader
59
+
60
+ require 'factbook/counter'
61
+
62
+ require 'factbook/db/schema' ## database (sql tables) support
63
+ require 'factbook/db/models'
64
+ require 'factbook/db/importer'
65
+
66
+
67
+
68
+ puts Factbook.banner
@@ -40,12 +40,23 @@ def initialize( html_ascii )
40
40
  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
41
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
42
 
43
- @html_debug = map_sects( @html )
44
- @html_debug = map_subsects( @html_debug )
45
43
 
46
- html_sects = split_sects( @html_debug )
44
+ html_sects = if @html.empty?
45
+ ## note: support "empty" pages - old format waiting for update!!!
46
+ ## cannot parse for now
47
+ [] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
48
+ else
49
+ @html_debug = map_sects( @html )
50
+ @html_debug = map_subsects( @html_debug )
51
+
52
+ split_sects( @html_debug )
53
+ end
54
+
47
55
  pp html_sects
48
56
 
57
+ ## debug
58
+ ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
59
+
49
60
 
50
61
  @sects = []
51
62
  html_sects.each do |html_sect|
@@ -5,88 +5,122 @@ module Factbook
5
5
  class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
6
  include LogUtils::Logging
7
7
  include NormalizeHelper ## e.g. normalize_category
8
-
8
+
9
9
  def initialize( html, name )
10
10
  @html = html
11
11
  @name = name # add category/field name e.g. Area, Location, etc.
12
12
  end
13
-
13
+
14
+
15
+
16
+ ##
17
+ ## <div class="category_data subfield text">
18
+ ## Portuguese (official and most widely spoken language)
19
+ ##
20
+ ## </div>
21
+ ## <div class="category_data note">
22
+ ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
23
+ ## </div>
24
+
25
+
14
26
  def read
15
27
  ## return hash from html snippet
16
28
  doc = Nokogiri::HTML.fragment( @html )
17
29
 
18
30
  data = {}
19
- last_node = nil ## track last hash (always use text key)
20
- last_node_data_count = 0
21
31
 
22
32
  ## note:
23
33
  ## skip whitespace text nodes (e.g. \n\n etc); just use divs
24
- doc.children.filter('div').each_with_index do |child,i|
25
-
26
- if child['class'] == 'category_data'
27
- text = child.text ## fix/todo: use strip
28
- puts "category_data: >#{text}<"
29
-
30
- if last_node.nil?
31
- ## assume its the very first entry; use implied/auto-created category
32
- data['text'] = ''
33
- last_node = data
34
- last_node_data_count = 0
35
- end
36
-
37
- ### first category_data element?
38
- if last_node_data_count == 0
39
- if last_node['text'] == ''
40
- last_node['text'] = text
41
- else ### possible ??? if data_count is zero - not should not include any data
42
- ## todo: issue warning here - why? why not??
43
- last_node['text'] += " #{text}" ## append w/o separator
44
- end
45
- else
46
- if @name == 'Demographic profile' ## special case (use space a sep)
47
- last_node['text'] += " #{text}" ## append without (w/o) separator
34
+ doc_children = doc.children.filter('div')
35
+
36
+ puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
37
+
38
+ doc_children.each_with_index do |div,i|
39
+ if div['class'].index( 'note' )
40
+ text = squish( div.text.strip )
41
+ puts "category_data: >#{text}<"
42
+
43
+ data['note'] = { 'text' => text }
44
+ elsif div['class'].index( 'historic' )
45
+ ## add all historic together into one for now
46
+ text = squish( div.text.strip )
47
+ puts "category_data: >#{text}<"
48
+
49
+ if i == 0
50
+ data['text'] = text
48
51
  else
49
- last_node['text'] += " ++ #{text}" ## append with ++ separator
52
+ ## append with / for now
53
+ data['text'] += " / #{text}"
50
54
  end
51
- end
52
- last_node_data_count += 1
55
+ elsif div.css( 'span.subfield-name').empty?
56
+ ## assume "implied text field"
57
+ ## check for index == 1 / child count == 1 - why? why not
58
+ text = squish( div.text.strip ) ## fix/todo: use strip
59
+ puts "category_data: >#{text}<"
53
60
 
54
- elsif child['class'].nil? ## div without any class e.g. <div>..</div>
55
- ## assume category and category_data pair w/ spans
56
- spans = child.children.filter('span')
57
- if spans.size > 2
58
- puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
61
+ data['text'] = text
62
+
63
+ ## must be always first node for now
64
+ if i != 0
65
+ puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
66
+ puts @html
67
+ exit 1
59
68
  end
60
-
61
- ## pp spans
62
-
63
- span_key = spans[0] ## assume 1st entry is span.category
64
- span_value = spans[1] ## assume 2nd entry is span.category_data
65
-
66
- key = normalize_category( span_key.text )
67
-
68
- ## note: allow optional category_data for now
69
- value = span_value ? span_value.text : nil
70
-
71
- puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
72
-
73
- ## start new pair
74
- last_node = data[key] = { 'text' => value }
75
- last_node_data_count = value ? 1 : 0 ## note: set to 1 if value present
69
+ elsif div['class'].index( 'grouped_subfield' )
70
+ ## split grouped subfield!!
71
+ ## <span class="subfield-name">arable land:</span>
72
+ ## <span class="subfield-number">8.6%</span>
73
+ ## <span class="subfield-date">(2011 est.)</span>
74
+ ## /
75
+ ## <span class="subfield-name">permanent crops:</span>
76
+ ## <span class="subfield-number">0.8%</span>
77
+ ## <span class="subfield-date">(2011 est.)</span>
78
+ ## /
79
+ ## <span class="subfield-name">permanent pasture:</span>
80
+ ## <span class="subfield-number">23.5%</span>
81
+ ## <span class="subfield-date">(2011 est.)</span>
82
+
83
+ ## join names for now - why? why not?
84
+ ## e.g. becomes:
85
+ ## arable land / permanent crops / permanent pasture: for key ??
86
+ span_names = div.css( 'span.subfield-name')
87
+ keys = []
88
+ span_names.each do |span|
89
+ keys << normalize_category( span.text.strip )
90
+ span.replace( '' )
91
+ end
92
+ key = keys.join( ' / ')
93
+ text = squish( div.text.strip )
94
+ puts "category_data key >#{key}<: >#{text}<"
95
+ data[ key ] = { 'text' => text }
76
96
  else
77
- puts "*** warn: item builder -- unknow css class in #{child.inspect}"
97
+ ## get subfield name
98
+ span_names = div.css( 'span.subfield-name')
99
+ if span_names.size > 1
100
+ puts "!! ERROR - found more than one subfield-name:"
101
+ puts div.to_html
102
+ exit 1
103
+ end
104
+ key = normalize_category( span_names[0].text.strip )
105
+ span_names[0].replace( '' )
106
+
107
+ text = squish( div.text.strip )
108
+ puts "category_data key >#{key}<: >#{text}<"
109
+ data[ key ] = { 'text' => text }
78
110
  end
79
-
80
- ## pp child
81
- ## css = child['class']
82
- ## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
83
111
  end
84
-
112
+
113
+
85
114
  pp data
86
115
  data
87
116
  end
88
117
 
89
-
118
+
119
+
120
+ def squish( str )
121
+ str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace multi-spaces (incl. newlines with once space)
122
+ end
123
+
90
124
  end # class ItemBuilder
91
125
 
92
126
  end # module Factbook
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -38,10 +37,10 @@ class Page
38
37
 
39
38
  def initialize( code, opts={} )
40
39
  ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
41
-
40
+
42
41
  if opts[:json]
43
42
  json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
44
- b = JsonBuilder.from_string( json )
43
+ b = JsonBuilder.from_string( json )
45
44
  else ## assume html
46
45
  if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
47
46
  ## for debugging and testing allow "custom" passed-in html page
@@ -49,11 +48,13 @@ class Page
49
48
  else
50
49
  url_string = SITE_BASE.gsub( '{code}', code )
51
50
  ## note: expects ASCII-7BIT/BINARY encoding
52
- html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
- end
51
+
52
+ ## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
+ html = Webcache.read( url_string )
54
+ end
54
55
  b = Builder.from_string( html )
55
56
  end
56
-
57
+
57
58
  @sects = b.sects
58
59
  @info = b.info
59
60
 
@@ -65,7 +66,7 @@ class Page
65
66
  @info = info
66
67
  end
67
68
 
68
- @data = {}
69
+ @data = {}
69
70
  @sects.each do |sect|
70
71
  @data[ sect.title ] = sect.data
71
72
  end
@@ -78,7 +79,7 @@ class Page
78
79
  if opts[:minify]
79
80
  data.to_json
80
81
  else
81
- ## was: -- opts[:pretty] || opts[:pp]
82
+ ## was: -- opts[:pretty] || opts[:pp]
82
83
  JSON.pretty_generate( data ) ## note: pretty print by default!
83
84
  end
84
85
  end
@@ -96,10 +97,10 @@ class Page
96
97
  end
97
98
 
98
99
  ## add convenience (shortcut) accessors / attributes / fields / getters
99
-
100
+
100
101
  ATTRIBUTES.each do |attrib|
101
102
  ## e.g.
102
- ## def background() data['Introduction']['Background']['text']; end
103
+ ## def background() data['Introduction']['Background']['text']; end
103
104
  ## def location() data['Geography']['Location']['text']; end
104
105
  ## etc.
105
106
  if attrib.path.size == 1
@@ -114,31 +115,18 @@ class Page
114
115
  fetch( attrib.path[1], {} )['text']
115
116
  end
116
117
  end
117
- end
118
+ end
118
119
 
119
120
 
120
121
  private
121
- def fetch_page( url_string )
122
-
123
- worker = Fetcher::Worker.new
124
- response = worker.get_response( url_string )
125
-
126
- if response.code == '200'
127
- t = response.body
128
- ###
129
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
130
- # will mostly be ASCII
131
- # - try to change encoding to UTF-8 ourselves
132
- logger.debug "t.encoding.name (before): #{t.encoding.name}"
133
- #####
134
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
135
- t
136
- else
137
- logger.error "fetch HTTP - #{response.code} #{response.message}"
138
- ## todo/fix: raise http exception (see fetcher) -- why? why not??
139
- fail "fetch HTTP - #{response.code} #{response.message}"
140
- nil
141
- end
122
+ def fetch_page( url )
123
+ response = Webget.page( url )
124
+
125
+ ## note: exit on get / fetch error - do NOT continue for now - why? why not?
126
+ exit 1 if response.status.nok? ## e.g. HTTP status code != 200
127
+
128
+
129
+ response.text
142
130
  end
143
131
 
144
132
 
@@ -157,29 +145,4 @@ end
157
145
 
158
146
 
159
147
  end # class Page
160
-
161
-
162
- =begin
163
- class PageFetcher
164
-
165
- def fetch( cc )
166
- worker = Fetcher::Worker.new
167
- factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
168
-
169
- res = worker.get_response( "#{factbook_base}/#{cc}.html" )
170
-
171
- # on error throw exception - why? why not??
172
- if res.code != '200'
173
- raise Fetcher::HttpError.new( res.code, res.message )
174
- end
175
-
176
- ###
177
- # Note: Net::HTTP will NOT set encoding UTF-8 etc.
178
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
179
- html = res.body.to_s
180
- end
181
- end # PageFetcher
182
- =end
183
-
184
-
185
148
  end # module Factbook