datacatalog-importer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +0 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/datacatalog-importer.gemspec +65 -0
- data/lib/datacatalog-importer.rb +14 -0
- data/lib/importer.rb +7 -0
- data/lib/puller.rb +65 -0
- data/lib/pusher.rb +138 -0
- data/lib/shared.rb +17 -0
- data/lib/sort_yaml_hash.rb +22 -0
- data/lib/tasks.rb +36 -0
- data/lib/utility.rb +128 -0
- data/spec/datacatalog-importer_spec.rb +7 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +93 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Sunlight Labs
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
File without changes
|
data/Rakefile
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "datacatalog-importer"
|
8
|
+
gem.summary = %Q{A framework to write National Data Catalog importers}
|
9
|
+
gem.description = %Q{This framework makes it easier to write importers for the National Data Catalog.}
|
10
|
+
gem.email = "djames@sunlightfoundation.com"
|
11
|
+
gem.homepage = "http://github.com/djsun/datacatalog-importer"
|
12
|
+
gem.authors = ["David James"]
|
13
|
+
gem.add_dependency "nokogiri", ">= 1.4.1"
|
14
|
+
gem.add_development_dependency "rspec", ">= 1.2.9"
|
15
|
+
# gem is a Gem::Specification...
|
16
|
+
# see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
+
end
|
18
|
+
Jeweler::GemcutterTasks.new
|
19
|
+
rescue LoadError
|
20
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
|
+
end
|
22
|
+
|
23
|
+
# require 'rake/rdoctask'
|
24
|
+
# Rake::RDocTask.new do |rdoc|
|
25
|
+
# version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
26
|
+
#
|
27
|
+
# rdoc.rdoc_dir = 'rdoc'
|
28
|
+
# rdoc.title = "datacatalog-importer #{version}"
|
29
|
+
# rdoc.rdoc_files.include('README*')
|
30
|
+
# rdoc.rdoc_files.include('lib/**/*.rb')
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# require 'spec/rake/spectask'
|
34
|
+
# Spec::Rake::SpecTask.new(:spec) do |spec|
|
35
|
+
# spec.libs << 'lib' << 'spec'
|
36
|
+
# spec.spec_files = FileList['spec/**/*_spec.rb']
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# Spec::Rake::SpecTask.new(:rcov) do |spec|
|
40
|
+
# spec.libs << 'lib' << 'spec'
|
41
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
42
|
+
# spec.rcov = true
|
43
|
+
# end
|
44
|
+
|
45
|
+
task :spec => :check_dependencies
|
46
|
+
|
47
|
+
task :default => :spec
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{datacatalog-importer}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["David James"]
|
12
|
+
s.date = %q{2010-02-05}
|
13
|
+
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
|
+
s.email = %q{djames@sunlightfoundation.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".gitignore",
|
22
|
+
"LICENSE",
|
23
|
+
"README.md",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"datacatalog-importer.gemspec",
|
27
|
+
"lib/datacatalog-importer.rb",
|
28
|
+
"lib/importer.rb",
|
29
|
+
"lib/puller.rb",
|
30
|
+
"lib/pusher.rb",
|
31
|
+
"lib/shared.rb",
|
32
|
+
"lib/sort_yaml_hash.rb",
|
33
|
+
"lib/tasks.rb",
|
34
|
+
"lib/utility.rb",
|
35
|
+
"spec/datacatalog-importer_spec.rb",
|
36
|
+
"spec/spec.opts",
|
37
|
+
"spec/spec_helper.rb"
|
38
|
+
]
|
39
|
+
s.homepage = %q{http://github.com/djsun/datacatalog-importer}
|
40
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
41
|
+
s.require_paths = ["lib"]
|
42
|
+
s.rubygems_version = %q{1.3.5}
|
43
|
+
s.summary = %q{A framework to write National Data Catalog importers}
|
44
|
+
s.test_files = [
|
45
|
+
"spec/datacatalog-importer_spec.rb",
|
46
|
+
"spec/spec_helper.rb"
|
47
|
+
]
|
48
|
+
|
49
|
+
if s.respond_to? :specification_version then
|
50
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
51
|
+
s.specification_version = 3
|
52
|
+
|
53
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
54
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
55
|
+
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
58
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
59
|
+
end
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
62
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# The following line is a workaround.
|
2
|
+
# See https://rails.lighthouseapp.com/projects/8994/tickets/2942-enumerableenumeratornext-causes-stack-level-too-deep-since-activesupport-233
|
3
|
+
#
|
4
|
+
# It allows Ruby 1.8.7's Enumerable#each to work properly. Otherwise,
|
5
|
+
# the 'active_support' gem loads the 'json' gem and there will be
|
6
|
+
# strange conflicts.
|
7
|
+
require 'generator.rb'
|
8
|
+
|
9
|
+
require File.dirname(__FILE__) + '/importer'
|
10
|
+
require File.dirname(__FILE__) + '/puller'
|
11
|
+
require File.dirname(__FILE__) + '/pusher'
|
12
|
+
require File.dirname(__FILE__) + '/sort_yaml_hash'
|
13
|
+
require File.dirname(__FILE__) + '/tasks'
|
14
|
+
require File.dirname(__FILE__) + '/utility'
|
data/lib/importer.rb
ADDED
data/lib/puller.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/shared'
|
2
|
+
|
3
|
+
module DataCatalog
  module Importer
    # Pulls :source and :organization records from importer classes supplied
    # by the caller and caches every record as a numbered YAML file under
    # the folder returned by Shared#folder for that resource.
    class Puller
      include DataCatalog::Importer::Shared

      # Option keys that must be present (and truthy) in the options hash.
      REQUIRED = %w(cache_folder pullers)

      # options - Hash with at least:
      #   :cache_folder - base directory for the YAML cache
      #   :pullers      - collection indexed by resource (:source and
      #                   :organization) whose values respond to `new`
      #
      # Raises Error if a required option is missing.
      def initialize(options)
        REQUIRED.each do |r|
          raise Error, "option :#{r} is required" unless options[r.intern]
        end
        @options = options
        # Number of the next cache file to write, tracked per resource.
        @counter = {
          :source       => 1,
          :organization => 1,
        }
      end

      # Pulls sources, then organizations, reporting the elapsed time of
      # each step through Utility.report_timing.
      def run
        Utility.report_timing "pull source" do
          pull_resource(:source)
        end
        Utility.report_timing "pull organization" do
          pull_resource(:organization)
        end
      end

      protected

      # Instantiates the puller registered for +resource+ and writes every
      # record it fetches to the cache. The loop stops at the first falsy
      # value returned by the puller's #fetch.
      #
      # Raises Error if no puller is registered for +resource+.
      #
      # Note on HTTP Throttling
      #
      # It might make sense to throttle the HTTP calls made while pulling a
      # resource. However, doing a simple sleep(TIME_DELAY) here is too
      # blunt: it makes sense when an HTTP call is made, but not when the
      # importer uses a local cache.
      #
      # An alternative is to wrap HTTP calls in this Importer library.
      # It could add a little bit of delay to HTTP calls that are made
      # too rapidly.
      def pull_resource(resource)
        unless importer_class = @options[:pullers][resource]
          # The original message interpolated an undefined local (`r`),
          # which raised NameError instead of this Error.
          raise Error, "options[:pullers][:#{resource}] is required"
        end
        importer = importer_class.new
        # NOTE(review): FileUtils is not required by this file; presumably
        # it is already loaded (e.g. by rake) -- confirm.
        FileUtils.mkdir_p(folder(resource))
        while (data = importer.fetch) do
          write_data(resource, data)
        end
      end

      # Writes +data+ as YAML to "<folder>/<8-digit counter>.yml" and
      # advances the counter for +resource+.
      def write_data(resource, data)
        file = folder(resource) + ("/%08i.yml" % @counter[resource])
        Utility.write_yaml(file, data)
        @counter[resource] += 1
      end

    end
  end
end
|
data/lib/pusher.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'datacatalog'
|
2
|
+
require File.dirname(__FILE__) + '/shared'
|
3
|
+
|
4
|
+
module DataCatalog
  module Importer
    # Reads the cached YAML files for organizations and sources and creates
    # or updates the matching Organization, Source and Download records
    # through the DataCatalog API client.
    class Pusher
      include DataCatalog::Importer::Shared

      # Option keys that must be present (and truthy) in the options hash.
      REQUIRED = %w(api_key base_uri cache_folder)

      # options - Hash with :api_key, :base_uri and :cache_folder.
      #
      # Raises Error if a required option is missing.
      def initialize(options)
        REQUIRED.each do |r|
          raise Error, "option :#{r} is required" unless options[r.intern]
        end
        @options = options
      end

      # Configures the API client, then pushes organizations followed by
      # sources (and their downloads), reporting timing for each step.
      def run
        setup_api
        Utility.report_timing "push organizations" do
          push_organizations
        end
        Utility.report_timing "push sources" do
          push_sources
        end
      end

      protected

      # Copies the API credentials from the options onto the client.
      def setup_api
        DataCatalog.api_key  = @options[:api_key]
        DataCatalog.base_uri = @options[:base_uri]
      end

      # Creates or updates one organization per cached file.
      def push_organizations
        read_data(:organization) do |data|
          create_or_update_organization(data)
        end
      end

      # Creates or updates one source per cached file, then each of the
      # downloads listed under its :downloads key.
      def push_sources
        read_data(:source) do |data|
          source = create_or_update_source(data)
          # Without a single matching source there is nothing to attach
          # downloads to; the ambiguity has already been reported.
          next unless source
          # A source without a :downloads key simply has no downloads.
          (data[:downloads] || []).each do |download_data|
            create_or_update_download(source, download_data)
          end
        end
      end

      # ---

      # Yields the parsed contents of every "*.yml" file in the cache
      # folder for +resource+, in sorted filename order.
      #
      # Raises RuntimeError if the folder is missing or holds no YAML files.
      def read_data(resource)
        folder = folder(resource)
        unless File.exist?(folder)
          raise "Directory does not exist: #{folder}"
        end
        wildcard = folder + "/*.yml"
        files = Dir.glob(wildcard)
        if files.empty?
          raise "No files found with: #{wildcard}"
        end
        files.sort.each do |f|
          data = YAML::load_file(f)
          yield data
        end
      end

      # ---

      # Looks up organizations by data[:url]: creates one when none match,
      # updates the only match, and files a report when several match.
      def create_or_update_organization(data)
        docs = DataCatalog::Organization.all(:url => data[:url])
        n = docs.length
        case n
        when 0
          puts "Creating Organization: #{data[:name]}"
          DataCatalog::Organization.create(data)
        when 1
          puts "Updating Organization: #{data[:name]}"
          DataCatalog::Organization.update(docs[0].id, data)
        else
          multiple_matches("Organization", n, data)
        end
      end

      # Looks up sources by data[:url]: creates one when none match and
      # updates the only match. The :organization and :downloads keys are
      # not sent to the API.
      #
      # Important: do not modify data (a filtered copy is used instead).
      #
      # Returns whatever the API client's create/update call returns, or nil
      # when several sources match. In that case a report is filed; the
      # report must not be returned, because the caller uses the result's
      # id as the :source_id of the downloads.
      def create_or_update_source(data)
        data = data.reject do |key, value|
          [:organization, :downloads].include?(key)
        end
        docs = DataCatalog::Source.all(:url => data[:url])
        n = docs.length
        case n
        when 0
          puts "Creating Source: #{data[:title]}"
          DataCatalog::Source.create(data)
        when 1
          puts "Updating Source: #{data[:title]}"
          DataCatalog::Source.update(docs[0].id, data)
        else
          multiple_matches("Source", n, data)
          nil
        end
      end

      # Looks up downloads by source id and format: creates one when none
      # match, updates the only match, and files a report otherwise.
      #
      # Important: do not modify data (a merged copy is used instead).
      def create_or_update_download(source, data)
        data = data.merge({:source_id => source.id})
        docs = DataCatalog::Download.all({
          :source_id => source.id,
          :format    => data[:format],
        })
        n = docs.length
        case n
        when 0
          puts "- Creating Download: #{data[:format]}"
          DataCatalog::Download.create(data)
        when 1
          puts "- Updating Download: #{data[:format]}"
          DataCatalog::Download.update(docs[0].id, data)
        else
          multiple_matches("Download", n, data)
        end
      end

      # ---

      # Prints a warning and files a "new" Report recording that +n+
      # existing +model+ records matched +data+, so no automatic update
      # was made.
      def multiple_matches(model, n, data)
        puts "? : #{n} matches for #{model}"
        DataCatalog::Report.create({
          :status => "new",
          :text   => "Cannot automatically update #{model}; there " +
                     "are #{n} matches for url : #{data[:url]}",
          :object => data.inspect,
        })
      end

    end
  end
end
|
data/lib/shared.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module DataCatalog
  module Importer
    # Behaviour mixed into Puller and Pusher. It relies on the including
    # object keeping its configuration in @options.
    module Shared

      # Returns the cache path for +resource+: the configured
      # :cache_folder joined with the resource name.
      #
      # Raises Error when @options is unset or has no :cache_folder.
      def folder(resource)
        raise Error, "@options is undefined" unless @options

        cache_folder = @options[:cache_folder]
        raise Error, "option :cache_folder is required" unless cache_folder

        File.join(cache_folder, resource.to_s)
      end

    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
# Core extension: emit hashes as YAML with their keys in sorted order.
class Hash

  # Emits this hash as a YAML mapping whose entries are added in ascending
  # key order (keys are compared with <=>).
  #
  # NOTE(review): relies on YAML::quick_emit, #taguri and #to_yaml_style,
  # which look like the Syck emitter API -- confirm the target Ruby/YAML
  # engine provides them.
  def to_yaml(opts = {})
    YAML::quick_emit(object_id, opts) do |emitter|
      emitter.map(taguri, to_yaml_style) do |mapping|
        ordered_pairs = sort do |(left_key, _), (right_key, _)|
          left_key <=> right_key
        end
        ordered_pairs.each do |key, value|
          mapping.add(key, value)
        end
      end
    end
  end

end
|
15
|
+
|
16
|
+
# Core extension: make symbols orderable so they can be sorted as keys.
class Symbol

  # Compares the string forms of the receiver and +other+, so +other+ can
  # be any object that responds to #to_s.
  def <=>(other)
    own_text   = to_s
    other_text = other.to_s
    own_text <=> other_text
  end

end
|
data/lib/tasks.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
module DataCatalog
  module Importer
    # Defines the :pull and :push rake tasks for one importer. Create an
    # instance from a Rakefile to register them.
    class Tasks
      # Newer Rake releases expose `desc` and `task` only through Rake::DSL
      # rather than on every object; older ones do not define the module.
      include Rake::DSL if defined?(Rake::DSL)

      # options - Hash read by the tasks:
      #   :name         - label used in the :pull description and message
      #   :cache_folder - passed to both Puller and Pusher
      #   :pullers      - passed to Puller
      #   :api_key      - passed to Pusher
      #   :base_uri     - passed to Pusher
      def initialize(options)
        define(options)
      end

      protected

      # Registers the two tasks. The options are only read when a task
      # actually runs, apart from :name in the :pull description.
      def define(options)
        desc "Pull data from the #{options[:name]}"
        task :pull do
          puts "Pulling data from the #{options[:name]}..."
          puller = DataCatalog::Importer::Puller.new({
            :cache_folder => options[:cache_folder],
            :pullers      => options[:pullers],
          })
          puller.run
        end

        desc "Push data to the Data Catalog API"
        task :push do
          # This was `desc`, which printed nothing and attached the text to
          # whichever task happened to be defined next.
          puts "Pushing data to the Data Catalog API..."
          pusher = DataCatalog::Importer::Pusher.new({
            :api_key      => options[:api_key],
            :base_uri     => options[:base_uri],
            :cache_folder => options[:cache_folder],
          })
          pusher.run
        end
      end

    end
  end
end
|
data/lib/utility.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module DataCatalog

  # Stateless helpers for importers: URL handling, text clean-up, fetching
  # and parsing HTML, timing, and writing YAML.
  class Utility

    # Resolves +url+ against +page_url+ and returns the absolute URL as a
    # plain String.
    def self.absolute_url(page_url, url)
      plain_string(URI.parse(page_url).merge(url).to_s)
    end

    # Returns +s+ collapsed onto one line: CR, LF and TAB become spaces,
    # non-ASCII characters are dropped, runs of spaces are squeezed and the
    # ends are stripped.
    def self.single_line_clean(s)
      plain_string(
        strip_non_ascii(s).gsub(/[\r\n\t]/, " ").squeeze(" ").strip)
    end

    # Like single_line_clean, but line breaks and tabs are kept.
    def self.multi_line_clean(s)
      plain_string(
        strip_non_ascii(s).squeeze(" ").strip)
    end

    # Downloads +uri+ and returns the response body as a String.
    def self.fetch(uri)
      puts "Fetching #{uri}..."
      # Block form so the underlying IO is closed once it has been read.
      open_remote(uri) { |io| io.read }
    end

    # Extra request headers sent with every fetch. open-uri sends String
    # keys verbatim as header names, so the hyphen in "User-Agent" matters.
    def self.headers
      {
        "User-Agent" => "National Data Catalog Importer/0.1.0",
      }
    end

    # Parses the HTML file at +filename+ and returns the Nokogiri document.
    def self.parse_file(filename)
      File.open(filename) do |f|
        Nokogiri::HTML::Document.parse(f)
      end
    end

    # Returns the parsed document cached at +file+. The cache is first
    # (re)filled from +uri+ when it does not exist or when
    # options[:force_fetch] is set.
    def self.parse_file_or_uri(uri, file, options={})
      if options[:force_fetch] || !File.exist?(file)
        document = parse_uri(uri)
        File.open(file, "w") { |f| f.write(document) }
      end
      parse_file(file) # Why always parse the file? See Note 001, below.
    end

    # Downloads +uri+ and returns it parsed as a Nokogiri HTML document.
    def self.parse_uri(uri)
      puts "Fetching #{uri}..."
      open_remote(uri) do |io|
        Nokogiri::HTML::Document.parse(io)
      end
    end

    # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
    # This method removes it so you can output clean YAML.
    #
    # Returns +s+ itself (modified in place when the variable was set).
    def self.plain_string(s)
      if s.instance_variable_defined?(:@_rails_html_safe)
        s.send(:remove_instance_variable, :@_rails_html_safe)
      end
      s
    end

    # Runs the block, printing start and elapsed-time messages tagged with
    # +label+, and returns the block's result.
    def self.report_timing(label)
      puts "Starting: [#{label}]"
      t0 = Time.now
      result = yield
      t1 = Time.now
      diff = t1 - t0
      puts "Elapsed time [#{label}] %.2f s" % diff
      result
    end

    # Sets the credentials used by the DataCatalog API client.
    def self.setup_api(api_key, base_uri)
      DataCatalog.api_key = api_key
      DataCatalog.base_uri = base_uri
    end

    # Returns +uris+ sorted by their numeric unique ID (see uid).
    def self.sort_uris(uris)
      uris.sort do |uri_a, uri_b|
        uid(uri_a) <=> uid(uri_b)
      end
    end

    # Converts a URI into a Unique ID
    #
    # For example:
    #   Utility.uid("http://data.octo.dc.gov/Metadata.aspx?id=137")
    #   => 137
    #
    # Raises Error when no non-zero integer follows "?id=".
    def self.uid(uri)
      last_part = uri.split("?id=").last
      result = last_part.to_i
      if result == 0
        # The message used to be built with an undefined `unindent` helper,
        # which raised NoMethodError instead of this Error.
        # NOTE(review): Error is resolved from the DataCatalog namespace;
        # confirm it is defined there.
        raise Error,
          "Could not make a non-zero Unique ID for this URI:\n#{uri}"
      end
      result
    end

    # Serializes +contents+ as YAML into +filename+, overwriting it.
    def self.write_yaml(filename, contents)
      File.open(filename, "w") do |f|
        YAML::dump(contents, f)
      end
    end

    # Removes every non-ASCII character from a copy of +s+. The previous
    # pattern, /[\x80-\xFF]/, is rejected as an invalid multibyte escape
    # when the source file is read as UTF-8.
    def self.strip_non_ascii(s)
      # Drop invalid byte sequences first so gsub cannot raise on them.
      s = s.scrub("") if s.respond_to?(:scrub)
      s.gsub(/[^\x00-\x7F]/, "")
    end
    private_class_method :strip_non_ascii

    # Opens +uri+ through open-uri with the importer's headers and yields
    # the IO to the block, returning the block's value. Going through the
    # parsed URI (rather than Kernel#open) keeps this working where
    # Kernel#open no longer accepts URLs, and never treats the string as a
    # local path or "|command".
    def self.open_remote(uri, &block)
      URI.parse(uri.to_s).open(headers, &block)
    end
    private_class_method :open_remote

  end

end
|
112
|
+
|
113
|
+
# == Note 001 ==
|
114
|
+
#
|
115
|
+
# I have experienced a strange bug. When fetching a URI from the Web,
|
116
|
+
# I get a document that has a link that contains a pipe ("|"). This is a
|
117
|
+
# problem because calling URI.parse on a string with a pipe raises an
|
118
|
+
# exception:
|
119
|
+
#
|
120
|
+
# http://ouc.dc.gov/ouc/cwp/view,a,3,q,552751,oucNav,|32048|.asp
|
121
|
+
# bad URI(is not URI?)
|
122
|
+
#
|
123
|
+
# Workaround: I write the fetched document to disk first, and then read
|
124
|
+
# the document from disk. For some reason the URL gets read from disk as:
|
125
|
+
#
|
126
|
+
# http://ouc.dc.gov/ouc/cwp/view,a,3,q,552751,oucNav,%7C32048%7C.asp
|
127
|
+
#
|
128
|
+
# It would be nice to figure out why this is necessary.
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: datacatalog-importer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David James
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-05 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.4.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.9
|
34
|
+
version:
|
35
|
+
description: This framework makes it easier to write importers for the National Data Catalog.
|
36
|
+
email: djames@sunlightfoundation.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.md
|
44
|
+
files:
|
45
|
+
- .document
|
46
|
+
- .gitignore
|
47
|
+
- LICENSE
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- VERSION
|
51
|
+
- datacatalog-importer.gemspec
|
52
|
+
- lib/datacatalog-importer.rb
|
53
|
+
- lib/importer.rb
|
54
|
+
- lib/puller.rb
|
55
|
+
- lib/pusher.rb
|
56
|
+
- lib/shared.rb
|
57
|
+
- lib/sort_yaml_hash.rb
|
58
|
+
- lib/tasks.rb
|
59
|
+
- lib/utility.rb
|
60
|
+
- spec/datacatalog-importer_spec.rb
|
61
|
+
- spec/spec.opts
|
62
|
+
- spec/spec_helper.rb
|
63
|
+
has_rdoc: true
|
64
|
+
homepage: http://github.com/djsun/datacatalog-importer
|
65
|
+
licenses: []
|
66
|
+
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options:
|
69
|
+
- --charset=UTF-8
|
70
|
+
require_paths:
|
71
|
+
- lib
|
72
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: "0"
|
77
|
+
version:
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: "0"
|
83
|
+
version:
|
84
|
+
requirements: []
|
85
|
+
|
86
|
+
rubyforge_project:
|
87
|
+
rubygems_version: 1.3.5
|
88
|
+
signing_key:
|
89
|
+
specification_version: 3
|
90
|
+
summary: A framework to write National Data Catalog importers
|
91
|
+
test_files:
|
92
|
+
- spec/datacatalog-importer_spec.rb
|
93
|
+
- spec/spec_helper.rb
|