factbook 1.2.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,75 +1,68 @@
1
- # encoding: utf-8
2
-
3
- ## stdlibs
4
-
5
- require 'net/http'
6
- require 'net/https' ## note: cia factbook requires https
7
- require 'uri'
8
- require 'cgi'
9
- require 'pp'
10
- require 'json'
11
- require 'csv'
12
- require 'fileutils'
13
- require 'erb' ## used by Almanac class (for render)
14
-
15
-
16
- ## 3rd party gems/libs
17
- ## require 'props'
18
-
19
- require 'logutils'
20
- require 'fetcher'
21
- require 'nokogiri'
22
-
23
- require 'active_record' ## add activerecord/db support (NOT optional for now)
24
-
25
-
26
- # our own code
27
-
28
- require 'factbook/version' # let it always go first
29
-
30
-
31
- require 'factbook/codes'
32
- require 'factbook/comparisons'
33
- require 'factbook/attributes'
34
-
35
- module Factbook
36
-
37
- ## auto-load builtin codes, comparisons, attributes, etc.
38
- CODES = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
39
- COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
40
- ATTRIBUTES = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
41
-
42
- def self.codes() CODES; end
43
- def self.comparisons() COMPARISONS; end
44
- def self.attributes() ATTRIBUTES; end
45
-
46
- end # module Factbook
47
-
48
- ## note: make codes, comparisons, attributes available
49
-
50
- require 'factbook/utils'
51
- require 'factbook/utils_info'
52
- require 'factbook/sanitizer'
53
- require 'factbook/normalize'
54
- require 'factbook/builder_item'
55
- require 'factbook/builder'
56
- require 'factbook/builder_json'
57
- require 'factbook/page'
58
- require 'factbook/page_info'
59
- require 'factbook/sect'
60
- require 'factbook/subsect'
61
-
62
- require 'factbook/reader_json'
63
- require 'factbook/almanac'
64
-
65
- require 'factbook/table' ## e.g. TableReader
66
-
67
- require 'factbook/counter'
68
-
69
- require 'factbook/db/schema' ## database (sql tables) support
70
- require 'factbook/db/models'
71
- require 'factbook/db/importer'
72
-
73
-
74
-
75
- puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
1
+ ## stdlibs
2
+
3
+
4
+ require 'cgi'
5
+ require 'csv' ## fix: use csvreader!!!!
6
+ require 'erb' ## used by Almanac class (for render)
7
+
8
+
9
+ ## 3rd party gems/libs
10
+ ## require 'props'
11
+
12
+ require 'logutils'
13
+ require 'webget'
14
+ require 'nokogiri'
15
+
16
+ require 'active_record' ## add activerecord/db support (NOT optional for now)
17
+
18
+
19
+
20
+ # our own code
21
+ require 'factbook/version' # let it always go first
22
+
23
+
24
+ require 'factbook/codes'
25
+ require 'factbook/comparisons'
26
+ require 'factbook/attributes'
27
+
28
+ module Factbook
29
+
30
+ ## auto-load builtin codes, comparisons, attributes, etc.
31
+ CODES = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
32
+ COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
33
+ ATTRIBUTES = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
34
+
35
+ def self.codes() CODES; end
36
+ def self.comparisons() COMPARISONS; end
37
+ def self.attributes() ATTRIBUTES; end
38
+
39
+ end # module Factbook
40
+
41
+ ## note: make codes, comparisons, attributes available
42
+
43
+ require 'factbook/utils'
44
+ require 'factbook/utils_info'
45
+ require 'factbook/sanitizer'
46
+ require 'factbook/normalize'
47
+ require 'factbook/builder_item'
48
+ require 'factbook/builder'
49
+ require 'factbook/builder_json'
50
+ require 'factbook/page'
51
+ require 'factbook/page_info'
52
+ require 'factbook/sect'
53
+ require 'factbook/subsect'
54
+
55
+ require 'factbook/reader_json'
56
+ require 'factbook/almanac'
57
+
58
+ require 'factbook/table' ## e.g. TableReader
59
+
60
+ require 'factbook/counter'
61
+
62
+ require 'factbook/db/schema' ## database (sql tables) support
63
+ require 'factbook/db/models'
64
+ require 'factbook/db/importer'
65
+
66
+
67
+
68
+ puts Factbook.banner
@@ -40,12 +40,23 @@ def initialize( html_ascii )
40
40
  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
41
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
42
 
43
- @html_debug = map_sects( @html )
44
- @html_debug = map_subsects( @html_debug )
45
43
 
46
- html_sects = split_sects( @html_debug )
44
+ html_sects = if @html.empty?
45
+ ## note: support "empty" pages - old format waiting for update!!!
46
+ ## cannot parse for now
47
+ [] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
48
+ else
49
+ @html_debug = map_sects( @html )
50
+ @html_debug = map_subsects( @html_debug )
51
+
52
+ split_sects( @html_debug )
53
+ end
54
+
47
55
  pp html_sects
48
56
 
57
+ ## debug
58
+ ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
59
+
49
60
 
50
61
  @sects = []
51
62
  html_sects.each do |html_sect|
@@ -5,88 +5,122 @@ module Factbook
5
5
  class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
6
  include LogUtils::Logging
7
7
  include NormalizeHelper ## e.g. normalize_category
8
-
8
+
9
9
  def initialize( html, name )
10
10
  @html = html
11
11
  @name = name # add category/field name e.g. Area, Location, etc.
12
12
  end
13
-
13
+
14
+
15
+
16
+ ##
17
+ ## <div class="category_data subfield text">
18
+ ## Portuguese (official and most widely spoken language)
19
+ ##
20
+ ## </div>
21
+ ## <div class="category_data note">
22
+ ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
23
+ ## </div>
24
+
25
+
14
26
  def read
15
27
  ## return hash from html snippet
16
28
  doc = Nokogiri::HTML.fragment( @html )
17
29
 
18
30
  data = {}
19
- last_node = nil ## track last hash (always use text key)
20
- last_node_data_count = 0
21
31
 
22
32
  ## note:
23
33
  ## skip whitespace text nodes (e.g. \n\n etc); just use divs
24
- doc.children.filter('div').each_with_index do |child,i|
25
-
26
- if child['class'] == 'category_data'
27
- text = child.text ## fix/todo: use strip
28
- puts "category_data: >#{text}<"
29
-
30
- if last_node.nil?
31
- ## assume its the very first entry; use implied/auto-created category
32
- data['text'] = ''
33
- last_node = data
34
- last_node_data_count = 0
35
- end
36
-
37
- ### first category_data element?
38
- if last_node_data_count == 0
39
- if last_node['text'] == ''
40
- last_node['text'] = text
41
- else ### possible ??? if data_count is zero - not should not include any data
42
- ## todo: issue warning here - why? why not??
43
- last_node['text'] += " #{text}" ## append w/o separator
44
- end
45
- else
46
- if @name == 'Demographic profile' ## special case (use space a sep)
47
- last_node['text'] += " #{text}" ## append without (w/o) separator
34
+ doc_children = doc.children.filter('div')
35
+
36
+ puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
37
+
38
+ doc_children.each_with_index do |div,i|
39
+ if div['class'].index( 'note' )
40
+ text = squish( div.text.strip )
41
+ puts "category_data: >#{text}<"
42
+
43
+ data['note'] = { 'text' => text }
44
+ elsif div['class'].index( 'historic' )
45
+ ## add all historic together into one for now
46
+ text = squish( div.text.strip )
47
+ puts "category_data: >#{text}<"
48
+
49
+ if i == 0
50
+ data['text'] = text
48
51
  else
49
- last_node['text'] += " ++ #{text}" ## append with ++ separator
52
+ ## append with / for now
53
+ data['text'] += " / #{text}"
50
54
  end
51
- end
52
- last_node_data_count += 1
55
+ elsif div.css( 'span.subfield-name').empty?
56
+ ## assume "implied text field"
57
+ ## check for index == 1 / child count == 1 - why? why not
58
+ text = squish( div.text.strip ) ## fix/todo: use strip
59
+ puts "category_data: >#{text}<"
53
60
 
54
- elsif child['class'].nil? ## div without any class e.g. <div>..</div>
55
- ## assume category and category_data pair w/ spans
56
- spans = child.children.filter('span')
57
- if spans.size > 2
58
- puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
61
+ data['text'] = text
62
+
63
+ ## must be always first node for now
64
+ if i != 0
65
+ puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
66
+ puts @html
67
+ exit 1
59
68
  end
60
-
61
- ## pp spans
62
-
63
- span_key = spans[0] ## assume 1st entry is span.category
64
- span_value = spans[1] ## assume 2nd entry is span.category_data
65
-
66
- key = normalize_category( span_key.text )
67
-
68
- ## note: allow optional category_data for now
69
- value = span_value ? span_value.text : nil
70
-
71
- puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
72
-
73
- ## start new pair
74
- last_node = data[key] = { 'text' => value }
75
- last_node_data_count = value ? 1 : 0 ## note: set to 1 if value present
69
+ elsif div['class'].index( 'grouped_subfield' )
70
+ ## split grouped subfield!!
71
+ ## <span class="subfield-name">arable land:</span>
72
+ ## <span class="subfield-number">8.6%</span>
73
+ ## <span class="subfield-date">(2011 est.)</span>
74
+ ## /
75
+ ## <span class="subfield-name">permanent crops:</span>
76
+ ## <span class="subfield-number">0.8%</span>
77
+ ## <span class="subfield-date">(2011 est.)</span>
78
+ ## /
79
+ ## <span class="subfield-name">permanent pasture:</span>
80
+ ## <span class="subfield-number">23.5%</span>
81
+ ## <span class="subfield-date">(2011 est.)</span>
82
+
83
+ ## join names for now - why? why not?
84
+ ## e.g. becomes:
85
+ ## arable land / permanent crops / permanent pasture: for key ??
86
+ span_names = div.css( 'span.subfield-name')
87
+ keys = []
88
+ span_names.each do |span|
89
+ keys << normalize_category( span.text.strip )
90
+ span.replace( '' )
91
+ end
92
+ key = keys.join( ' / ')
93
+ text = squish( div.text.strip )
94
+ puts "category_data key >#{key}<: >#{text}<"
95
+ data[ key ] = { 'text' => text }
76
96
  else
77
- puts "*** warn: item builder -- unknow css class in #{child.inspect}"
97
+ ## get subfield name
98
+ span_names = div.css( 'span.subfield-name')
99
+ if span_names.size > 1
100
+ puts "!! ERROR - found more than one subfield-name:"
101
+ puts div.to_html
102
+ exit 1
103
+ end
104
+ key = normalize_category( span_names[0].text.strip )
105
+ span_names[0].replace( '' )
106
+
107
+ text = squish( div.text.strip )
108
+ puts "category_data key >#{key}<: >#{text}<"
109
+ data[ key ] = { 'text' => text }
78
110
  end
79
-
80
- ## pp child
81
- ## css = child['class']
82
- ## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
83
111
  end
84
-
112
+
113
+
85
114
  pp data
86
115
  data
87
116
  end
88
117
 
89
-
118
+
119
+
120
+ def squish( str )
121
+ str.gsub( /[ \t\n\r]{2,}/, ' ') ## collapse each run of two or more whitespace chars (space, tab, newline, carriage return) into one space; a lone whitespace char is left as-is
122
+ end
123
+
90
124
  end # class ItemBuilder
91
125
 
92
126
  end # module Factbook
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -38,10 +37,10 @@ class Page
38
37
 
39
38
  def initialize( code, opts={} )
40
39
  ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
41
-
40
+
42
41
  if opts[:json]
43
42
  json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
44
- b = JsonBuilder.from_string( json )
43
+ b = JsonBuilder.from_string( json )
45
44
  else ## assume html
46
45
  if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
47
46
  ## for debugging and testing allow "custom" passed-in html page
@@ -49,11 +48,13 @@ class Page
49
48
  else
50
49
  url_string = SITE_BASE.gsub( '{code}', code )
51
50
  ## note: expects ASCII-7BIT/BINARY encoding
52
- html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
- end
51
+
52
+ ## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
+ html = Webcache.read( url_string )
54
+ end
54
55
  b = Builder.from_string( html )
55
56
  end
56
-
57
+
57
58
  @sects = b.sects
58
59
  @info = b.info
59
60
 
@@ -65,7 +66,7 @@ class Page
65
66
  @info = info
66
67
  end
67
68
 
68
- @data = {}
69
+ @data = {}
69
70
  @sects.each do |sect|
70
71
  @data[ sect.title ] = sect.data
71
72
  end
@@ -78,7 +79,7 @@ class Page
78
79
  if opts[:minify]
79
80
  data.to_json
80
81
  else
81
- ## was: -- opts[:pretty] || opts[:pp]
82
+ ## was: -- opts[:pretty] || opts[:pp]
82
83
  JSON.pretty_generate( data ) ## note: pretty print by default!
83
84
  end
84
85
  end
@@ -96,10 +97,10 @@ class Page
96
97
  end
97
98
 
98
99
  ## add convenience (shortcut) accessors / attributes / fields / getters
99
-
100
+
100
101
  ATTRIBUTES.each do |attrib|
101
102
  ## e.g.
102
- ## def background() data['Introduction']['Background']['text']; end
103
+ ## def background() data['Introduction']['Background']['text']; end
103
104
  ## def location() data['Geography']['Location']['text']; end
104
105
  ## etc.
105
106
  if attrib.path.size == 1
@@ -114,31 +115,18 @@ class Page
114
115
  fetch( attrib.path[1], {} )['text']
115
116
  end
116
117
  end
117
- end
118
+ end
118
119
 
119
120
 
120
121
  private
121
- def fetch_page( url_string )
122
-
123
- worker = Fetcher::Worker.new
124
- response = worker.get_response( url_string )
125
-
126
- if response.code == '200'
127
- t = response.body
128
- ###
129
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
130
- # will mostly be ASCII
131
- # - try to change encoding to UTF-8 ourselves
132
- logger.debug "t.encoding.name (before): #{t.encoding.name}"
133
- #####
134
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
135
- t
136
- else
137
- logger.error "fetch HTTP - #{response.code} #{response.message}"
138
- ## todo/fix: raise http exception (see fetcher) -- why? why not??
139
- fail "fetch HTTP - #{response.code} #{response.message}"
140
- nil
141
- end
122
+ def fetch_page( url )
123
+ response = Webget.page( url )
124
+
125
+ ## note: exit on get / fetch error - do NOT continue for now - why? why not?
126
+ exit 1 if response.status.nok? ## e.g. HTTP status code != 200
127
+
128
+
129
+ response.text
142
130
  end
143
131
 
144
132
 
@@ -157,29 +145,4 @@ end
157
145
 
158
146
 
159
147
  end # class Page
160
-
161
-
162
- =begin
163
- class PageFetcher
164
-
165
- def fetch( cc )
166
- worker = Fetcher::Worker.new
167
- factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
168
-
169
- res = worker.get_response( "#{factbook_base}/#{cc}.html" )
170
-
171
- # on error throw exception - why? why not??
172
- if res.code != '200'
173
- raise Fetcher::HttpError.new( res.code, res.message )
174
- end
175
-
176
- ###
177
- # Note: Net::HTTP will NOT set encoding UTF-8 etc.
178
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
179
- html = res.body.to_s
180
- end
181
- end # PageFetcher
182
- =end
183
-
184
-
185
148
  end # module Factbook