factbook-readers 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
@@ -1,28 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- class Sect
6
- include LogUtils::Logging
7
-
8
- attr_accessor :title ## use name instead of title - why? why not?
9
- attr_accessor :subsects
10
-
11
- def initialize
12
- @subsects = []
13
- end
14
-
15
- def data
16
- ## convert sects to hash
17
- @data = {}
18
-
19
- subsects.each_with_index do |subsect,i|
20
- @data[ subsect.title ] = subsect.data
21
- end
22
- @data
23
- end
24
-
25
-
26
- end # class Sect
27
-
28
- end # module Factbook
@@ -1,17 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- class Subsect
6
- include LogUtils::Logging
7
-
8
- attr_accessor :title ## use name instead of title - why? why not?
9
- attr_accessor :data ## hash holding data e.g. { 'text' => '...' etc. }
10
-
11
- def initialize
12
- @data = {}
13
- end
14
-
15
- end # class Subsect
16
-
17
- end # module Factbook
@@ -1,51 +0,0 @@
1
-
2
- module Factbook
3
-
4
- ##
5
- ## make more "generic" - why? why not?
6
- ## (re)use for other files ?? move to textutils ??
7
-
8
- ##
9
- ## for now reads in rows with values separated by at least 3+ spaces e.g.:
10
- ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
11
- ## 1 China 1,367,485,388
12
- ## 2 India 1,251,695,584
13
- ## 3 European Union 513,949,445
14
- ## 4 United States 321,368,864
15
- ## 5 Indonesia 255,993,674
16
- ## 6 Brazil 204,259,812
17
-
18
-
19
- class TableReader
20
- include LogUtils::Logging
21
-
22
-
23
- def initialize( text )
24
- @text = text
25
- end
26
-
27
- def read
28
- recs = []
29
-
30
- line_no = 0
31
- @text.each_line do |line|
32
- line_no +=1
33
- line = line.strip ## remove leading and trailing whitespace
34
- if line.empty?
35
- puts "** skipping empty line #{line_no}"
36
- next
37
- end
38
-
39
- values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
40
-
41
- ## puts line
42
- ## pp values
43
- recs << values
44
- end
45
- recs
46
- end
47
-
48
-
49
- end # class TableReader
50
-
51
- end # module Factbook
@@ -1,47 +0,0 @@
1
-
2
- module Factbook
3
- module Utils
4
-
5
-
6
- def values_to_csv( values )
7
- buf = ""
8
- values.each_with_index do |value,i|
9
- buf << ',' if i > 0 ## add comma (except for first value)
10
- ## note: allow optional $ sign e.g. $100,000,000
11
- ## !!!! todo/fix: allow optional minus e.g. -44,000
12
- if value =~ /^\$?[1-9][,0-9]+[0-9]$/ ### find a better regex - why? why not??
13
- ## check if number e.g. 17,098,242 or $17,098,242
14
- ## remove commas 17098242
15
- buf << value.gsub( ',', '' )
16
- elsif value.index( ',').nil?
17
- ## add as is 1:1 (no commana)
18
- buf << value
19
- else
20
- ## escape comma with double quote
21
- # e.g. Guam, The becomes "Guam, The"
22
- buf << '"'
23
- buf << value
24
- buf << '"'
25
- end
26
- end
27
- buf
28
- end
29
-
30
-
31
- def data_to_csv( recs, headers )
32
- text = ""
33
-
34
- text << values_to_csv( headers )
35
- text << "\n"
36
-
37
- recs.each do |rec|
38
- text << values_to_csv( rec )
39
- text << "\n"
40
- end
41
-
42
- text
43
- end
44
-
45
-
46
- end # module Utils
47
- end # module Factbook
@@ -1,128 +0,0 @@
1
-
2
- module Factbook
3
- module Utils
4
-
5
- #######
6
- ## find meta data (about page info)
7
-
8
-
9
- #### e.g. Page last updated on September 16, 2015
10
-
11
- MONTH_EN_TO_S={
12
- 'January' => '1',
13
- 'February' => '2',
14
- 'March' => '3',
15
- 'April' => '4',
16
- 'May' => '5',
17
- 'June' => '6',
18
- 'July' => '7',
19
- 'August' => '8',
20
- 'September' => '9',
21
- 'October' => '10',
22
- 'November' => '11',
23
- 'December' => '12'
24
- }
25
-
26
-
27
-
28
- ##
29
- # examples (to match):
30
- # Page last updated on November 03, 2016
31
- # Page last updated on September 24, 2015
32
-
33
- PAGE_LAST_UPDATED_REGEX = /
34
- Page \s last \s updated \s on \s
35
- (?<month_en>[a-z]+) \s
36
- (?<day>\d{1,2}), \s
37
- (?<year>\d{4})
38
- /imx
39
-
40
- def find_page_last_updated( html )
41
- m = PAGE_LAST_UPDATED_REGEX.match( html )
42
- if m
43
- pp m
44
- month_en = m[:month_en]
45
- day = m[:day]
46
- year = m[:year]
47
- puts "** bingo - month #{month_en}, day #{day}, year #{year}"
48
-
49
- month = MONTH_EN_TO_S[ month_en ]
50
- date_str = "#{year}-#{month}-#{day}"
51
- pp date_str
52
- date = Date.strptime( date_str, '%Y-%m-%d' )
53
- date
54
- else
55
- nil
56
- end
57
- end
58
-
59
-
60
- ## fallback: find "standalone" country coude
61
- ## e.g.
62
- ## ccode='au'
63
-
64
- COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/
65
-
66
- def find_country_code( html )
67
- m = COUNTRY_CODE_REGEX.match( html )
68
- if m
69
- pp m
70
- cc = m[:cc]
71
- puts "** bingo - country code #{cc}"
72
- cc
73
- else
74
- nil
75
- end
76
- end
77
-
78
-
79
- ##
80
- ## e.g. regioncode="eur"
81
- ## countrycode="au"
82
- ## countryname="Austria"
83
- ## flagsubfield=""
84
- ## countryaffiliation=""
85
- ## flagdescription=""
86
- ## flagdescriptionnote=""
87
- ## region="Europe"
88
- ##
89
- ## note: countryaffiliation may be empty
90
-
91
-
92
-
93
- PAGE_INFO_REGEX = /
94
- regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
95
- \s+
96
- countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
97
- \s+
98
- countryname=(?<q3>"|')(?<country>.+?)\k<q3>
99
- \s+
100
- [^>]+? ## allow any attribs (note: non-greedy)
101
- countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
102
- \s+
103
- [^>]+? ## allow any attribs (note: non-greedy)
104
- region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
105
- /imx
106
-
107
-
108
- def find_page_info( html )
109
- m = PAGE_INFO_REGEX.match( html )
110
- if m
111
- pp m
112
-
113
- h = { country_code: m[:country_code],
114
- country_name: m[:country],
115
- country_affiliation: m[:affiliation],
116
- region_code: m[:region_code],
117
- region_name: m[:region] }
118
-
119
- puts "** bingo - #{h.inspect}"
120
- h ## return hash w/ name-value pairs
121
- else
122
- nil ## or return empty struct with nils/empty strings - why?? why not??
123
- end
124
- end
125
-
126
-
127
- end # module Utils
128
- end # module Factbook
data/test/test_builder.rb DELETED
@@ -1,30 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_builder.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
-
10
- class TestBuilder < MiniTest::Test
11
-
12
- def test_build
13
-
14
- ['au','be'].each do |code|
15
- ## use/fix: ASCII-8BIT (e.g.keep as is) -???
16
- ## fix/todo: use ASCII8BIT/binary reader ??
17
- b = Factbook::Builder.from_file( "#{Factbook.root}/test/data/src/#{code}.html" )
18
- pp b.sects
19
-
20
- File.open( "./tmp/#{code}.debug.html", 'w' ) do |f|
21
- f.write b.html_debug
22
- end
23
- end
24
-
25
- assert true ## assume everthing ok
26
- end
27
-
28
-
29
- end # class TestBuilder
30
-
data/test/test_codes.rb DELETED
@@ -1,72 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_codes.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
- class TestCodes < MiniTest::Test
10
-
11
-
12
- def test_codes
13
- assert_equal 261, Factbook.codes.size
14
- assert_equal 261, Factbook.codes.to_a.size
15
-
16
-
17
- assert_equal 195, Factbook.codes.countries.size
18
- assert_equal 52, Factbook.codes.dependencies.size
19
- assert_equal 5, Factbook.codes.oceans.size
20
- assert_equal 1, Factbook.codes.world.size
21
- assert_equal 2, Factbook.codes.others.size
22
- assert_equal 6, Factbook.codes.misc.size
23
-
24
- assert_equal 8, Factbook.codes.dependencies_us.size
25
-
26
-
27
- assert_equal 55, Factbook.codes.europe.size
28
- assert_equal 9, Factbook.codes.south_asia.size
29
- assert_equal 6, Factbook.codes.central_asia.size
30
- assert_equal 22, Factbook.codes.east_n_souteast_asia.size
31
- assert_equal 19, Factbook.codes.middle_east.size
32
- assert_equal 56, Factbook.codes.africa.size
33
- assert_equal 7, Factbook.codes.north_america.size
34
- assert_equal 33, Factbook.codes.central_america_n_caribbean.size
35
- assert_equal 14, Factbook.codes.south_america.size
36
- assert_equal 30, Factbook.codes.australia_oceania.size
37
- assert_equal 4, Factbook.codes.antartica.size
38
- assert_equal 5, Factbook.codes.region('Oceans').size
39
- assert_equal 1, Factbook.codes.region('World').size
40
-
41
- assert_equal 45, Factbook.codes.countries.europe.size
42
-
43
- assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
44
- assert_equal Factbook.codes.category('World').size, Factbook.codes.region('World').size
45
-
46
-
47
- assert_equal 261, Factbook.codes.countries.size +
48
- Factbook.codes.others.size +
49
- Factbook.codes.dependencies.size +
50
- Factbook.codes.misc.size +
51
- Factbook.codes.oceans.size +
52
- Factbook.codes.world.size
53
-
54
- assert_equal 261, Factbook.codes.europe.size +
55
- Factbook.codes.south_asia.size +
56
- Factbook.codes.central_asia.size +
57
- Factbook.codes.east_n_souteast_asia.size +
58
- Factbook.codes.middle_east.size +
59
- Factbook.codes.africa.size +
60
- Factbook.codes.north_america.size +
61
- Factbook.codes.central_america_n_caribbean.size +
62
- Factbook.codes.south_america.size +
63
- Factbook.codes.australia_oceania.size +
64
- Factbook.codes.antartica.size +
65
- Factbook.codes.region('Oceans').size +
66
- Factbook.codes.region('World').size
67
-
68
- end
69
-
70
- end # class TestCodes
71
-
72
-
@@ -1,16 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_comparisons.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
- class TestComparisons < MiniTest::Test
10
-
11
- def test_comparisons
12
- assert_equal 74, Factbook.comparisons.size
13
- assert_equal 74, Factbook.comparisons.to_a.size
14
- end
15
-
16
- end # class TestComparisons
@@ -1,97 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_item_builder.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
- class TestItemBuilder < MiniTest::Test
10
-
11
- def test_location
12
-
13
- html =<<EOS
14
- <div class=category_data>Central Europe, north of Italy and Slovenia</div>
15
- EOS
16
-
17
- b = Factbook::ItemBuilder.new( html, 'Location' )
18
- b.read
19
-
20
- assert true ## assume everthing ok
21
- end
22
-
23
- def test_area
24
- html =<<EOS
25
- <div><span class=category>total: </span><span class=category_data>83,871 sq km</span></div>
26
- <div><span class=category>land: </span><span class=category_data>82,445 sq km</span></div>
27
- <div><span class=category>water: </span><span class=category_data>1,426 sq km</span></div>
28
- EOS
29
-
30
- b = Factbook::ItemBuilder.new( html, 'Area' )
31
- b.read
32
-
33
- assert true ## assume everthing ok
34
- end
35
-
36
- def test_land_use
37
- html =<<EOS
38
- <div><span class=category>agricultural land: </span><span class=category_data>38.4%</span></div>
39
- <div class=category_data>arable land 16.5%; permanent crops 0.8%; permanent pasture 21.1%</div>
40
- <div><span class=category>forest: </span><span class=category_data>47.2%</span></div>
41
- <div><span class=category>other: </span><span class=category_data>14.4% (2011 est.)</span></div>
42
- EOS
43
-
44
- b = Factbook::ItemBuilder.new( html, 'Land use' )
45
- b.read
46
-
47
- assert true ## assume everthing ok
48
- end
49
-
50
- def test_contraceptive_prevalence_rate
51
- html =<<EOS
52
- <div class=category_data>69.6%</div>
53
- <div><span class=category>note: </span><span class=category_data>percent of women aged 18-46 (2008/09)</span></div>
54
- EOS
55
-
56
- b = Factbook::ItemBuilder.new( html, 'Contraceptive Prevalence Rate' )
57
- b.read
58
-
59
- assert true ## assume everthing ok
60
- end
61
-
62
- def test_drinking_water_source
63
- html =<<EOS
64
- <div><span class=category>improved: </span><span class=category_data></span></div>
65
- <div class=category_data>urban: 100% of population</div>
66
- <div class=category_data>rural: 100% of population</div>
67
- <div class=category_data>total: 100% of population</div>
68
- <div><span class=category>unimproved: </span><span class=category_data></span></div>
69
- <div class=category_data>urban: 0% of population</div>
70
- <div class=category_data>rural: 0% of population</div>
71
- <div class=category_data>total: 0% of population (2015 est.)</div>
72
- EOS
73
-
74
- b = Factbook::ItemBuilder.new( html, 'Drinking Water Source' )
75
- b.read
76
-
77
- assert true ## assume everthing ok
78
- end
79
-
80
- def test_political_pressure_groups_and_leaders
81
- html =<<EOS
82
- <div class=category_data>Austrian Trade Union Federation or OeGB (nominally independent but primarily Social Democratic)</div>
83
- <div class=category_data>Federal Economic Chamber (OeVP-dominated)</div>
84
- <div class=category_data>Labor Chamber or AK (Social Democratic-leaning think tank)</div>
85
- <div class=category_data>OeVP-oriented Association of Austrian Industrialists or IV</div>
86
- <div class=category_data>Roman Catholic Church, including its chief lay organization, Catholic Action</div>
87
- <div><span class=category>other: </span><span class=category_data>three composite leagues of the Austrian People's Party or OeVP representing business, labor, farmers, and other nongovernment organizations in the areas of environment and human rights</span></div>
88
- EOS
89
-
90
- b = Factbook::ItemBuilder.new( html, 'Political pressure groups and leaders' )
91
- b.read
92
-
93
- assert true ## assume everthing ok
94
- end
95
-
96
- end # class TestItemBuilder
97
-