factbook-readers 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
data/lib/factbook-readers/sect.rb DELETED
@@ -1,28 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- class Sect
6
- include LogUtils::Logging
7
-
8
- attr_accessor :title ## use name instead of title - why? why not?
9
- attr_accessor :subsects
10
-
11
- def initialize
12
- @subsects = []
13
- end
14
-
15
- def data
16
- ## convert sects to hash
17
- @data = {}
18
-
19
- subsects.each_with_index do |subsect,i|
20
- @data[ subsect.title ] = subsect.data
21
- end
22
- @data
23
- end
24
-
25
-
26
- end # class Sect
27
-
28
- end # module Factbook
data/lib/factbook-readers/subsect.rb DELETED
@@ -1,17 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- class Subsect
6
- include LogUtils::Logging
7
-
8
- attr_accessor :title ## use name instead of title - why? why not?
9
- attr_accessor :data ## hash holding data e.g. { 'text' => '...' etc. }
10
-
11
- def initialize
12
- @data = {}
13
- end
14
-
15
- end # class Subsect
16
-
17
- end # module Factbook
data/lib/factbook-readers/table.rb DELETED
@@ -1,51 +0,0 @@
1
-
2
- module Factbook
3
-
4
- ##
5
- ## make more "generic" - why? why not?
6
- ## (re)use for other files ?? move to textutils ??
7
-
8
- ##
9
- ## for now reads in rows with values separated by at least 3+ spaces e.g.:
10
- ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
11
- ## 1 China 1,367,485,388
12
- ## 2 India 1,251,695,584
13
- ## 3 European Union 513,949,445
14
- ## 4 United States 321,368,864
15
- ## 5 Indonesia 255,993,674
16
- ## 6 Brazil 204,259,812
17
-
18
-
19
- class TableReader
20
- include LogUtils::Logging
21
-
22
-
23
- def initialize( text )
24
- @text = text
25
- end
26
-
27
- def read
28
- recs = []
29
-
30
- line_no = 0
31
- @text.each_line do |line|
32
- line_no +=1
33
- line = line.strip ## remove leading and trailing whitespace
34
- if line.empty?
35
- puts "** skipping empty line #{line_no}"
36
- next
37
- end
38
-
39
- values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
40
-
41
- ## puts line
42
- ## pp values
43
- recs << values
44
- end
45
- recs
46
- end
47
-
48
-
49
- end # class TableReader
50
-
51
- end # module Factbook
data/lib/factbook-readers/utils.rb DELETED
@@ -1,47 +0,0 @@
1
-
2
- module Factbook
3
- module Utils
4
-
5
-
6
- def values_to_csv( values )
7
- buf = ""
8
- values.each_with_index do |value,i|
9
- buf << ',' if i > 0 ## add comma (except for first value)
10
- ## note: allow optional $ sign e.g. $100,000,000
11
- ## !!!! todo/fix: allow optional minus e.g. -44,000
12
- if value =~ /^\$?[1-9][,0-9]+[0-9]$/ ### find a better regex - why? why not??
13
- ## check if number e.g. 17,098,242 or $17,098,242
14
- ## remove commas 17098242
15
- buf << value.gsub( ',', '' )
16
- elsif value.index( ',').nil?
17
- ## add as is 1:1 (no comma)
18
- buf << value
19
- else
20
- ## escape comma with double quote
21
- # e.g. Guam, The becomes "Guam, The"
22
- buf << '"'
23
- buf << value
24
- buf << '"'
25
- end
26
- end
27
- buf
28
- end
29
-
30
-
31
- def data_to_csv( recs, headers )
32
- text = ""
33
-
34
- text << values_to_csv( headers )
35
- text << "\n"
36
-
37
- recs.each do |rec|
38
- text << values_to_csv( rec )
39
- text << "\n"
40
- end
41
-
42
- text
43
- end
44
-
45
-
46
- end # module Utils
47
- end # module Factbook
data/lib/factbook-readers/utils_info.rb DELETED
@@ -1,128 +0,0 @@
1
-
2
- module Factbook
3
- module Utils
4
-
5
- #######
6
- ## find meta data (about page info)
7
-
8
-
9
- #### e.g. Page last updated on September 16, 2015
10
-
11
- MONTH_EN_TO_S={
12
- 'January' => '1',
13
- 'February' => '2',
14
- 'March' => '3',
15
- 'April' => '4',
16
- 'May' => '5',
17
- 'June' => '6',
18
- 'July' => '7',
19
- 'August' => '8',
20
- 'September' => '9',
21
- 'October' => '10',
22
- 'November' => '11',
23
- 'December' => '12'
24
- }
25
-
26
-
27
-
28
- ##
29
- # examples (to match):
30
- # Page last updated on November 03, 2016
31
- # Page last updated on September 24, 2015
32
-
33
- PAGE_LAST_UPDATED_REGEX = /
34
- Page \s last \s updated \s on \s
35
- (?<month_en>[a-z]+) \s
36
- (?<day>\d{1,2}), \s
37
- (?<year>\d{4})
38
- /imx
39
-
40
- def find_page_last_updated( html )
41
- m = PAGE_LAST_UPDATED_REGEX.match( html )
42
- if m
43
- pp m
44
- month_en = m[:month_en]
45
- day = m[:day]
46
- year = m[:year]
47
- puts "** bingo - month #{month_en}, day #{day}, year #{year}"
48
-
49
- month = MONTH_EN_TO_S[ month_en ]
50
- date_str = "#{year}-#{month}-#{day}"
51
- pp date_str
52
- date = Date.strptime( date_str, '%Y-%m-%d' )
53
- date
54
- else
55
- nil
56
- end
57
- end
58
-
59
-
60
- ## fallback: find "standalone" country code
61
- ## e.g.
62
- ## ccode='au'
63
-
64
- COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/
65
-
66
- def find_country_code( html )
67
- m = COUNTRY_CODE_REGEX.match( html )
68
- if m
69
- pp m
70
- cc = m[:cc]
71
- puts "** bingo - country code #{cc}"
72
- cc
73
- else
74
- nil
75
- end
76
- end
77
-
78
-
79
- ##
80
- ## e.g. regioncode="eur"
81
- ## countrycode="au"
82
- ## countryname="Austria"
83
- ## flagsubfield=""
84
- ## countryaffiliation=""
85
- ## flagdescription=""
86
- ## flagdescriptionnote=""
87
- ## region="Europe"
88
- ##
89
- ## note: countryaffiliation may be empty
90
-
91
-
92
-
93
- PAGE_INFO_REGEX = /
94
- regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
95
- \s+
96
- countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
97
- \s+
98
- countryname=(?<q3>"|')(?<country>.+?)\k<q3>
99
- \s+
100
- [^>]+? ## allow any attribs (note: non-greedy)
101
- countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
102
- \s+
103
- [^>]+? ## allow any attribs (note: non-greedy)
104
- region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
105
- /imx
106
-
107
-
108
- def find_page_info( html )
109
- m = PAGE_INFO_REGEX.match( html )
110
- if m
111
- pp m
112
-
113
- h = { country_code: m[:country_code],
114
- country_name: m[:country],
115
- country_affiliation: m[:affiliation],
116
- region_code: m[:region_code],
117
- region_name: m[:region] }
118
-
119
- puts "** bingo - #{h.inspect}"
120
- h ## return hash w/ name-value pairs
121
- else
122
- nil ## or return empty struct with nils/empty strings - why?? why not??
123
- end
124
- end
125
-
126
-
127
- end # module Utils
128
- end # module Factbook
data/test/test_builder.rb DELETED
@@ -1,30 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_builder.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
-
10
- class TestBuilder < MiniTest::Test
11
-
12
- def test_build
13
-
14
- ['au','be'].each do |code|
15
- ## use/fix: ASCII-8BIT (e.g.keep as is) -???
16
- ## fix/todo: use ASCII8BIT/binary reader ??
17
- b = Factbook::Builder.from_file( "#{Factbook.root}/test/data/src/#{code}.html" )
18
- pp b.sects
19
-
20
- File.open( "./tmp/#{code}.debug.html", 'w' ) do |f|
21
- f.write b.html_debug
22
- end
23
- end
24
-
25
- assert true ## assume everything ok
26
- end
27
-
28
-
29
- end # class TestBuilder
30
-
data/test/test_codes.rb DELETED
@@ -1,72 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_codes.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
- class TestCodes < MiniTest::Test
10
-
11
-
12
- def test_codes
13
- assert_equal 261, Factbook.codes.size
14
- assert_equal 261, Factbook.codes.to_a.size
15
-
16
-
17
- assert_equal 195, Factbook.codes.countries.size
18
- assert_equal 52, Factbook.codes.dependencies.size
19
- assert_equal 5, Factbook.codes.oceans.size
20
- assert_equal 1, Factbook.codes.world.size
21
- assert_equal 2, Factbook.codes.others.size
22
- assert_equal 6, Factbook.codes.misc.size
23
-
24
- assert_equal 8, Factbook.codes.dependencies_us.size
25
-
26
-
27
- assert_equal 55, Factbook.codes.europe.size
28
- assert_equal 9, Factbook.codes.south_asia.size
29
- assert_equal 6, Factbook.codes.central_asia.size
30
- assert_equal 22, Factbook.codes.east_n_souteast_asia.size
31
- assert_equal 19, Factbook.codes.middle_east.size
32
- assert_equal 56, Factbook.codes.africa.size
33
- assert_equal 7, Factbook.codes.north_america.size
34
- assert_equal 33, Factbook.codes.central_america_n_caribbean.size
35
- assert_equal 14, Factbook.codes.south_america.size
36
- assert_equal 30, Factbook.codes.australia_oceania.size
37
- assert_equal 4, Factbook.codes.antartica.size
38
- assert_equal 5, Factbook.codes.region('Oceans').size
39
- assert_equal 1, Factbook.codes.region('World').size
40
-
41
- assert_equal 45, Factbook.codes.countries.europe.size
42
-
43
- assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
44
- assert_equal Factbook.codes.category('World').size, Factbook.codes.region('World').size
45
-
46
-
47
- assert_equal 261, Factbook.codes.countries.size +
48
- Factbook.codes.others.size +
49
- Factbook.codes.dependencies.size +
50
- Factbook.codes.misc.size +
51
- Factbook.codes.oceans.size +
52
- Factbook.codes.world.size
53
-
54
- assert_equal 261, Factbook.codes.europe.size +
55
- Factbook.codes.south_asia.size +
56
- Factbook.codes.central_asia.size +
57
- Factbook.codes.east_n_souteast_asia.size +
58
- Factbook.codes.middle_east.size +
59
- Factbook.codes.africa.size +
60
- Factbook.codes.north_america.size +
61
- Factbook.codes.central_america_n_caribbean.size +
62
- Factbook.codes.south_america.size +
63
- Factbook.codes.australia_oceania.size +
64
- Factbook.codes.antartica.size +
65
- Factbook.codes.region('Oceans').size +
66
- Factbook.codes.region('World').size
67
-
68
- end
69
-
70
- end # class TestCodes
71
-
72
-
data/test/test_comparisons.rb DELETED
@@ -1,16 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_comparisons.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
- class TestComparisons < MiniTest::Test
10
-
11
- def test_comparisons
12
- assert_equal 74, Factbook.comparisons.size
13
- assert_equal 74, Factbook.comparisons.to_a.size
14
- end
15
-
16
- end # class TestComparisons
data/test/test_item_builder.rb DELETED
@@ -1,97 +0,0 @@
1
- ###
2
- # to run use
3
- # ruby -I ./lib -I ./test test/test_item_builder.rb
4
-
5
-
6
- require 'helper'
7
-
8
-
9
- class TestItemBuilder < MiniTest::Test
10
-
11
- def test_location
12
-
13
- html =<<EOS
14
- <div class=category_data>Central Europe, north of Italy and Slovenia</div>
15
- EOS
16
-
17
- b = Factbook::ItemBuilder.new( html, 'Location' )
18
- b.read
19
-
20
- assert true ## assume everything ok
21
- end
22
-
23
- def test_area
24
- html =<<EOS
25
- <div><span class=category>total: </span><span class=category_data>83,871 sq km</span></div>
26
- <div><span class=category>land: </span><span class=category_data>82,445 sq km</span></div>
27
- <div><span class=category>water: </span><span class=category_data>1,426 sq km</span></div>
28
- EOS
29
-
30
- b = Factbook::ItemBuilder.new( html, 'Area' )
31
- b.read
32
-
33
- assert true ## assume everything ok
34
- end
35
-
36
- def test_land_use
37
- html =<<EOS
38
- <div><span class=category>agricultural land: </span><span class=category_data>38.4%</span></div>
39
- <div class=category_data>arable land 16.5%; permanent crops 0.8%; permanent pasture 21.1%</div>
40
- <div><span class=category>forest: </span><span class=category_data>47.2%</span></div>
41
- <div><span class=category>other: </span><span class=category_data>14.4% (2011 est.)</span></div>
42
- EOS
43
-
44
- b = Factbook::ItemBuilder.new( html, 'Land use' )
45
- b.read
46
-
47
- assert true ## assume everything ok
48
- end
49
-
50
- def test_contraceptive_prevalence_rate
51
- html =<<EOS
52
- <div class=category_data>69.6%</div>
53
- <div><span class=category>note: </span><span class=category_data>percent of women aged 18-46 (2008/09)</span></div>
54
- EOS
55
-
56
- b = Factbook::ItemBuilder.new( html, 'Contraceptive Prevalence Rate' )
57
- b.read
58
-
59
- assert true ## assume everything ok
60
- end
61
-
62
- def test_drinking_water_source
63
- html =<<EOS
64
- <div><span class=category>improved: </span><span class=category_data></span></div>
65
- <div class=category_data>urban: 100% of population</div>
66
- <div class=category_data>rural: 100% of population</div>
67
- <div class=category_data>total: 100% of population</div>
68
- <div><span class=category>unimproved: </span><span class=category_data></span></div>
69
- <div class=category_data>urban: 0% of population</div>
70
- <div class=category_data>rural: 0% of population</div>
71
- <div class=category_data>total: 0% of population (2015 est.)</div>
72
- EOS
73
-
74
- b = Factbook::ItemBuilder.new( html, 'Drinking Water Source' )
75
- b.read
76
-
77
- assert true ## assume everything ok
78
- end
79
-
80
- def test_political_pressure_groups_and_leaders
81
- html =<<EOS
82
- <div class=category_data>Austrian Trade Union Federation or OeGB (nominally independent but primarily Social Democratic)</div>
83
- <div class=category_data>Federal Economic Chamber (OeVP-dominated)</div>
84
- <div class=category_data>Labor Chamber or AK (Social Democratic-leaning think tank)</div>
85
- <div class=category_data>OeVP-oriented Association of Austrian Industrialists or IV</div>
86
- <div class=category_data>Roman Catholic Church, including its chief lay organization, Catholic Action</div>
87
- <div><span class=category>other: </span><span class=category_data>three composite leagues of the Austrian People's Party or OeVP representing business, labor, farmers, and other nongovernment organizations in the areas of environment and human rights</span></div>
88
- EOS
89
-
90
- b = Factbook::ItemBuilder.new( html, 'Political pressure groups and leaders' )
91
- b.read
92
-
93
- assert true ## assume everything ok
94
- end
95
-
96
- end # class TestItemBuilder
97
-