datacatalog-importer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +0 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/datacatalog-importer.gemspec +65 -0
- data/lib/datacatalog-importer.rb +14 -0
- data/lib/importer.rb +7 -0
- data/lib/puller.rb +65 -0
- data/lib/pusher.rb +138 -0
- data/lib/shared.rb +17 -0
- data/lib/sort_yaml_hash.rb +22 -0
- data/lib/tasks.rb +36 -0
- data/lib/utility.rb +128 -0
- data/spec/datacatalog-importer_spec.rb +7 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +93 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Sunlight Labs
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
File without changes
|
data/Rakefile
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Rake build file for the datacatalog-importer gem.
require 'rubygems'
require 'rake'

# Gem packaging is delegated to Jeweler. When Jeweler cannot be loaded,
# a hint is printed and the packaging tasks are simply not defined.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name        = "datacatalog-importer"
    spec.summary     = "A framework to write National Data Catalog importers"
    spec.description = "This framework makes it easier to write importers for the National Data Catalog."
    spec.email       = "djames@sunlightfoundation.com"
    spec.homepage    = "http://github.com/djsun/datacatalog-importer"
    spec.authors     = ["David James"]
    spec.add_dependency("nokogiri", ">= 1.4.1")
    spec.add_development_dependency("rspec", ">= 1.2.9")
    # spec is a Gem::Specification...
    # see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# The RDoc and RSpec tasks below are currently disabled.
#
# require 'rake/rdoctask'
# Rake::RDocTask.new do |rdoc|
#   version = File.exist?('VERSION') ? File.read('VERSION') : ""
#
#   rdoc.rdoc_dir = 'rdoc'
#   rdoc.title = "datacatalog-importer #{version}"
#   rdoc.rdoc_files.include('README*')
#   rdoc.rdoc_files.include('lib/**/*.rb')
# end
#
# require 'spec/rake/spectask'
# Spec::Rake::SpecTask.new(:spec) do |spec|
#   spec.libs << 'lib' << 'spec'
#   spec.spec_files = FileList['spec/**/*_spec.rb']
# end
#
# Spec::Rake::SpecTask.new(:rcov) do |spec|
#   spec.libs << 'lib' << 'spec'
#   spec.pattern = 'spec/**/*_spec.rb'
#   spec.rcov = true
# end

# NOTE(review): :check_dependencies is not defined in this file; presumably
# it is provided by Jeweler::Tasks -- confirm.
task :spec => [:check_dependencies]

task :default => [:spec]
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{datacatalog-importer}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["David James"]
|
12
|
+
s.date = %q{2010-02-05}
|
13
|
+
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
|
+
s.email = %q{djames@sunlightfoundation.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".gitignore",
|
22
|
+
"LICENSE",
|
23
|
+
"README.md",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"datacatalog-importer.gemspec",
|
27
|
+
"lib/datacatalog-importer.rb",
|
28
|
+
"lib/importer.rb",
|
29
|
+
"lib/puller.rb",
|
30
|
+
"lib/pusher.rb",
|
31
|
+
"lib/shared.rb",
|
32
|
+
"lib/sort_yaml_hash.rb",
|
33
|
+
"lib/tasks.rb",
|
34
|
+
"lib/utility.rb",
|
35
|
+
"spec/datacatalog-importer_spec.rb",
|
36
|
+
"spec/spec.opts",
|
37
|
+
"spec/spec_helper.rb"
|
38
|
+
]
|
39
|
+
s.homepage = %q{http://github.com/djsun/datacatalog-importer}
|
40
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
41
|
+
s.require_paths = ["lib"]
|
42
|
+
s.rubygems_version = %q{1.3.5}
|
43
|
+
s.summary = %q{A framework to write National Data Catalog importers}
|
44
|
+
s.test_files = [
|
45
|
+
"spec/datacatalog-importer_spec.rb",
|
46
|
+
"spec/spec_helper.rb"
|
47
|
+
]
|
48
|
+
|
49
|
+
if s.respond_to? :specification_version then
|
50
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
51
|
+
s.specification_version = 3
|
52
|
+
|
53
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
54
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
55
|
+
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
58
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
59
|
+
end
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
62
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# The following line is a workaround.
|
2
|
+
# See https://rails.lighthouseapp.com/projects/8994/tickets/2942-enumerableenumeratornext-causes-stack-level-too-deep-since-activesupport-233
|
3
|
+
#
|
4
|
+
# It allows Ruby 1.8.7's Enumerable#each to work properly. Otherwise,
|
5
|
+
# the 'active_support' gem loads the 'json' gem and there will be
|
6
|
+
# strange conflicts.
|
7
|
+
require 'generator.rb'
|
8
|
+
|
9
|
+
require File.dirname(__FILE__) + '/importer'
|
10
|
+
require File.dirname(__FILE__) + '/puller'
|
11
|
+
require File.dirname(__FILE__) + '/pusher'
|
12
|
+
require File.dirname(__FILE__) + '/sort_yaml_hash'
|
13
|
+
require File.dirname(__FILE__) + '/tasks'
|
14
|
+
require File.dirname(__FILE__) + '/utility'
|
data/lib/importer.rb
ADDED
data/lib/puller.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/shared'
|
2
|
+
|
3
|
+
module DataCatalog
  module Importer
    # Pulls :source and :organization records from importer-supplied
    # puller classes and caches each record on disk as a numbered YAML
    # file under <cache_folder>/<resource>/.
    class Puller
      include DataCatalog::Importer::Shared

      # Option keys (as strings) that must be present and truthy.
      REQUIRED = %w(cache_folder pullers)

      # options - Hash with:
      #   :cache_folder - base directory for the cached YAML files
      #   :pullers      - Hash mapping :source / :organization to a class
      #                   that responds to .new and whose instances
      #                   respond to #fetch
      #
      # Raises Error when a required option is missing.
      def initialize(options)
        REQUIRED.each do |r|
          raise Error, "option :#{r} is required" unless options[r.intern]
        end
        @options = options
        # Next file number to use for each resource type.
        @counter = {
          :source       => 1,
          :organization => 1,
        }
      end

      # Pulls sources, then organizations, reporting the elapsed time
      # of each phase via Utility.report_timing.
      def run
        Utility.report_timing "pull source" do
          pull_resource(:source)
        end
        Utility.report_timing "pull organization" do
          pull_resource(:organization)
        end
      end

      protected

      # Note on HTTP Throttling
      #
      # It might make sense to throttle HTTP calls in
      #   * pull_organizations
      #   * pull_sources
      #
      # However, doing a simple sleep(TIME_DELAY) is too blunt.
      # It makes sense when an HTTP call is made; however, it does
      # not make sense when the importer uses a local cache.
      #
      # An alternative is to wrap HTTP calls in this Importer library.
      # It could add a little bit of delay to HTTP calls that are made
      # too rapidly.
      #
      # Instantiates the puller class registered for +resource+ and writes
      # every value returned by its #fetch to disk, stopping at the first
      # nil/false.
      #
      # Raises Error when no puller class is registered for +resource+.
      def pull_resource(resource)
        unless importer_class = @options[:pullers][resource]
          # Fixed: the message previously interpolated an undefined local
          # (`r`), which raised NameError instead of this Error.
          raise Error, "options[:pullers][:#{resource}] is required"
        end
        importer = importer_class.new
        # NOTE(review): FileUtils is not required in this file; presumably
        # it is already loaded by Rake -- confirm.
        FileUtils.mkdir_p(folder(resource))
        while (data = importer.fetch) do
          write_data(resource, data)
        end
      end

      # Writes +data+ as YAML to the next zero-padded, 8-digit file name
      # in the resource's folder and advances that resource's counter.
      def write_data(resource, data)
        file = folder(resource) + ("/%08i.yml" % @counter[resource])
        Utility.write_yaml(file, data)
        @counter[resource] += 1
      end

    end
  end
end
|
data/lib/pusher.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'datacatalog'
|
2
|
+
require File.dirname(__FILE__) + '/shared'
|
3
|
+
|
4
|
+
module DataCatalog
  module Importer
    # Reads the YAML files cached under <cache_folder>/organization and
    # <cache_folder>/source and creates or updates the matching records
    # through the DataCatalog API client.
    class Pusher
      include DataCatalog::Importer::Shared

      # Option keys (as strings) that must be present and truthy.
      REQUIRED = %w(api_key base_uri cache_folder)

      # options - Hash with :api_key, :base_uri and :cache_folder.
      #
      # Raises Error when a required option is missing.
      def initialize(options)
        REQUIRED.each do |r|
          raise Error, "option :#{r} is required" unless options[r.intern]
        end
        @options = options
      end

      # Configures the API client, then pushes organizations followed by
      # sources, reporting the elapsed time of each phase.
      def run
        setup_api
        Utility.report_timing "push organizations" do
          push_organizations
        end
        Utility.report_timing "push sources" do
          push_sources
        end
      end

      protected

      # Copies the API credentials from the options onto the client.
      def setup_api
        DataCatalog.api_key  = @options[:api_key]
        DataCatalog.base_uri = @options[:base_uri]
      end

      def push_organizations
        read_data(:organization) do |data|
          create_or_update_organization(data)
        end
      end

      # Pushes each cached source and then its downloads.
      #
      # Downloads are skipped when the source could not be created or
      # updated unambiguously (create_or_update_source returns nil);
      # previously the report object returned for that case was used as
      # the parent "source" of the downloads. A source without a
      # :downloads entry is treated as having no downloads.
      def push_sources
        read_data(:source) do |data|
          source = create_or_update_source(data)
          next unless source
          (data[:downloads] || []).each do |download_data|
            create_or_update_download(source, download_data)
          end
        end
      end

      # ---

      # Yields the parsed contents of every *.yml file in the resource's
      # cache folder, in sorted file-name order.
      #
      # Raises RuntimeError when the folder is missing or contains no
      # YAML files.
      def read_data(resource)
        folder = folder(resource)
        unless File.exist?(folder)
          raise "Directory does not exist: #{folder}"
        end
        wildcard = folder + "/*.yml"
        files = Dir.glob(wildcard)
        if files.empty?
          raise "No files found with: #{wildcard}"
        end
        files.sort.each do |f|
          data = YAML::load_file(f)
          yield data
        end
      end

      # ---

      # Creates the organization when no existing record has the same
      # :url, updates it when exactly one does, and files a report when
      # several do.
      def create_or_update_organization(data)
        docs = DataCatalog::Organization.all(:url => data[:url])
        n = docs.length
        case n
        when 0
          puts "Creating Organization: #{data[:name]}"
          DataCatalog::Organization.create(data)
        when 1
          puts "Updating Organization: #{data[:name]}"
          DataCatalog::Organization.update(docs[0].id, data)
        else
          multiple_matches("Organization", n, data)
        end
      end

      # Creates or updates a source, matched by :url. The :organization
      # and :downloads keys are stripped from the payload first.
      #
      # Important: do not modify data (a filtered copy is used instead).
      #
      # Returns whatever the API client's create/update call returns, or
      # nil when several records match and only a report was filed.
      def create_or_update_source(data)
        data = data.reject do |key, value|
          [:organization, :downloads].include?(key)
        end
        docs = DataCatalog::Source.all(:url => data[:url])
        n = docs.length
        case n
        when 0
          puts "Creating Source: #{data[:title]}"
          DataCatalog::Source.create(data)
        when 1
          puts "Updating Source: #{data[:title]}"
          DataCatalog::Source.update(docs[0].id, data)
        else
          multiple_matches("Source", n, data)
          nil # no unambiguous source record for downloads to attach to
        end
      end

      # Creates or updates a download belonging to +source+, matched by
      # source id and :format.
      #
      # Important: do not modify data (a merged copy is used instead).
      def create_or_update_download(source, data)
        data = data.merge({:source_id => source.id})
        docs = DataCatalog::Download.all({
          :source_id => source.id,
          :format    => data[:format],
        })
        n = docs.length
        case n
        when 0
          puts "- Creating Download: #{data[:format]}"
          DataCatalog::Download.create(data)
        when 1
          puts "- Updating Download: #{data[:format]}"
          DataCatalog::Download.update(docs[0].id, data)
        else
          multiple_matches("Download", n, data)
        end
      end

      # ---

      # Prints a warning and files a "new" report through the API client
      # describing the ambiguous match, so it can be resolved by hand.
      def multiple_matches(model, n, data)
        puts "? : #{n} matches for #{model}"
        DataCatalog::Report.create({
          :status => "new",
          :text   => "Cannot automatically update #{model}; there " +
            "are #{n} matches for url : #{data[:url]}",
          :object => data.inspect,
        })
      end

    end
  end
end
|
data/lib/shared.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module DataCatalog
  module Importer
    # Behaviour mixed into classes that keep their configuration in an
    # @options hash.
    module Shared

      # Returns the cache directory for +resource+ (a Symbol or String):
      # the :cache_folder option joined with the resource name.
      #
      # Raises Error when @options is unset or lacks :cache_folder.
      def folder(resource)
        raise Error, "@options is undefined" unless @options

        # Look the option up once and reuse it; the value was previously
        # assigned to a local and then fetched from @options a second time.
        cache_folder = @options[:cache_folder]
        raise Error, "option :cache_folder is required" unless cache_folder

        File.join(cache_folder, resource.to_s)
      end

    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
class Hash

  # Emits this hash as a YAML mapping whose entries are added in
  # ascending key order (keys compared with <=>).
  #
  # NOTE(review): YAML::quick_emit, taguri and to_yaml_style are not
  # defined in this file; presumably they come from the YAML library
  # loaded above -- confirm they exist on the Ruby version in use.
  def to_yaml(opts = {})
    YAML::quick_emit(object_id, opts) do |emitter|
      emitter.map(taguri, to_yaml_style) do |mapping|
        ordered_pairs = sort { |left, right| left[0] <=> right[0] }
        ordered_pairs.each do |pair|
          mapping.add(pair[0], pair[1])
        end
      end
    end
  end

end
|
15
|
+
|
16
|
+
class Symbol

  # Compares this symbol with +other+ by the string forms (to_s) of
  # both operands.
  def <=>(other)
    own_name, other_name = to_s, other.to_s
    own_name <=> other_name
  end

end
|
data/lib/tasks.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
module DataCatalog
  module Importer
    # Registers the :pull and :push tasks for an importer using the
    # `desc` / `task` task-definition methods.
    class Tasks

      # options - Hash with:
      #   :name         - name of the data source, used in messages
      #   :cache_folder - passed to both Puller and Pusher
      #   :pullers      - passed to Puller
      #   :api_key      - passed to Pusher
      #   :base_uri     - passed to Pusher
      def initialize(options)
        define(options)
      end

      protected

      # Defines both tasks. Each task builds its worker from the relevant
      # subset of +options+ and runs it.
      def define(options)
        desc "Pull data from the #{options[:name]}"
        task :pull do
          puts "Pulling data from the #{options[:name]}..."
          puller = DataCatalog::Importer::Puller.new({
            :cache_folder => options[:cache_folder],
            :pullers      => options[:pullers],
          })
          puller.run
        end

        desc "Push data to the Data Catalog API"
        task :push do
          # Fixed: this line called `desc`, which printed nothing and
          # registered a stray task description; a progress message
          # (matching the :pull task) is what was intended.
          puts "Pushing data to the Data Catalog API..."
          pusher = DataCatalog::Importer::Pusher.new({
            :api_key      => options[:api_key],
            :base_uri     => options[:base_uri],
            :cache_folder => options[:cache_folder],
          })
          pusher.run
        end
      end

    end
  end
end
|
data/lib/utility.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module DataCatalog

  # Class-level helpers for importers: text clean-up, fetching and
  # parsing HTML, timing, and YAML output.
  class Utility

    # Matches anything outside the 7-bit ASCII range. Written as a negated
    # ASCII range because the earlier /[\x80-\xFF]/ form is rejected as an
    # "invalid multibyte escape" when the source is read as UTF-8
    # (Ruby 1.9 and later).
    NON_ASCII = /[^\x00-\x7F]/

    # Resolves +url+ against +page_url+ and returns the result as a
    # plain String.
    def self.absolute_url(page_url, url)
      Utility.plain_string(URI.parse(page_url).merge(url).to_s)
    end

    # Returns +s+ on one line: CR/LF/TAB become spaces, non-ASCII
    # characters are dropped, runs of spaces are collapsed, and the ends
    # are stripped.
    def self.single_line_clean(s)
      plain_string(
        s.gsub(/[\r\n\t]/, " ").gsub(NON_ASCII, "").squeeze(" ").strip)
    end

    # Like single_line_clean, but leaves line breaks and tabs in place.
    def self.multi_line_clean(s)
      plain_string(
        s.gsub(NON_ASCII, "").squeeze(" ").strip)
    end

    # Fetches +uri+ and returns the response body.
    # The block form of open is used so the IO is closed once it is read.
    def self.fetch(uri)
      puts "Fetching #{uri}..."
      open(uri, headers) { |io| io.read }
    end

    # Request headers sent with every fetch.
    # Fixed: the key was "UserAgent", which is not the HTTP User-Agent
    # header.
    def self.headers
      {
        "User-Agent" => "National Data Catalog Importer/0.1.0",
      }
    end

    # Parses the HTML file at +filename+ into a Nokogiri document.
    def self.parse_file(filename)
      File.open(filename) do |f|
        Nokogiri::HTML::Document.parse(f)
      end
    end

    # Returns the parsed document for +uri+, using +file+ as a cache.
    # The URI is fetched (and +file+ rewritten) when +file+ does not exist
    # or options[:force_fetch] is set.
    def self.parse_file_or_uri(uri, file, options={})
      if options[:force_fetch] || !File.exist?(file)
        document = parse_uri(uri)
        File.open(file, "w") { |f| f.write(document) }
      end
      parse_file(file) # Why always parse the file? See Note 001, below.
    end

    # Fetches +uri+ and parses the response into a Nokogiri document.
    def self.parse_uri(uri)
      puts "Fetching #{uri}..."
      open(uri, headers) do |io|
        Nokogiri::HTML::Document.parse(io)
      end
    end

    # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
    # This method removes it so you can output clean YAML.
    #
    # Returns the same object +s+ (modified in place).
    def self.plain_string(s)
      if s.instance_variable_defined?(:@_rails_html_safe)
        s.send(:remove_instance_variable, :@_rails_html_safe)
      end
      s
    end

    # Runs the block, printing start and elapsed-time messages for
    # +label+, and returns the block's result.
    def self.report_timing(label)
      puts "Starting: [#{label}]"
      t0 = Time.now
      result = yield
      t1 = Time.now
      diff = t1 - t0
      puts "Elapsed time [#{label}] %.2f s" % diff
      result
    end

    # Sets the API key and base URI on the DataCatalog client.
    def self.setup_api(api_key, base_uri)
      DataCatalog.api_key = api_key
      DataCatalog.base_uri = base_uri
    end

    # Returns +uris+ sorted by their numeric unique id (see uid).
    def self.sort_uris(uris)
      uris.sort do |uri_a, uri_b|
        uid(uri_a) <=> uid(uri_b)
      end
    end

    # Converts a URI into a Unique ID
    #
    # For example:
    #   Utility.uid("http://data.octo.dc.gov/Metadata.aspx?id=137")
    #   => 137
    #
    # Raises Error when the text after the last "?id=" does not convert
    # to a non-zero integer.
    def self.uid(uri)
      last_part = uri.split("?id=").last
      result = last_part.to_i
      if result == 0
        # The message is built directly: the previous heredoc was passed
        # to `unindent`, which is not defined in this file.
        raise Error,
          "Could not make a non-zero Unique ID for this URI:\n#{uri}\n"
      end
      result
    end

    # Dumps +contents+ as YAML into +filename+, replacing any existing
    # file.
    def self.write_yaml(filename, contents)
      File.open(filename, "w") do |f|
        YAML::dump(contents, f)
      end
    end

  end

end
|
112
|
+
|
113
|
+
# == Note 001 ==
|
114
|
+
#
|
115
|
+
# I have experienced a strange bug. When fetching a URI from the Web,
|
116
|
+
# I get a document that has a link that contains a pipe ("|"). This is a
|
117
|
+
# problem because calling URI.parse on a string with a pipe raises an
|
118
|
+
# exception:
|
119
|
+
#
|
120
|
+
# http://ouc.dc.gov/ouc/cwp/view,a,3,q,552751,oucNav,|32048|.asp
|
121
|
+
# bad URI(is not URI?)
|
122
|
+
#
|
123
|
+
# Workaround: I write the fetched document to disk first, and then read
|
124
|
+
# the document from disk. For some reason the URL gets read from disk as:
|
125
|
+
#
|
126
|
+
# http://ouc.dc.gov/ouc/cwp/view,a,3,q,552751,oucNav,%7C32048%7C.asp
|
127
|
+
#
|
128
|
+
# It would be nice to figure out why this is necessary.
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: datacatalog-importer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David James
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-05 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.4.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.9
|
34
|
+
version:
|
35
|
+
description: This framework makes it easier to write importers for the National Data Catalog.
|
36
|
+
email: djames@sunlightfoundation.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.md
|
44
|
+
files:
|
45
|
+
- .document
|
46
|
+
- .gitignore
|
47
|
+
- LICENSE
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- VERSION
|
51
|
+
- datacatalog-importer.gemspec
|
52
|
+
- lib/datacatalog-importer.rb
|
53
|
+
- lib/importer.rb
|
54
|
+
- lib/puller.rb
|
55
|
+
- lib/pusher.rb
|
56
|
+
- lib/shared.rb
|
57
|
+
- lib/sort_yaml_hash.rb
|
58
|
+
- lib/tasks.rb
|
59
|
+
- lib/utility.rb
|
60
|
+
- spec/datacatalog-importer_spec.rb
|
61
|
+
- spec/spec.opts
|
62
|
+
- spec/spec_helper.rb
|
63
|
+
has_rdoc: true
|
64
|
+
homepage: http://github.com/djsun/datacatalog-importer
|
65
|
+
licenses: []
|
66
|
+
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options:
|
69
|
+
- --charset=UTF-8
|
70
|
+
require_paths:
|
71
|
+
- lib
|
72
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: "0"
|
77
|
+
version:
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: "0"
|
83
|
+
version:
|
84
|
+
requirements: []
|
85
|
+
|
86
|
+
rubyforge_project:
|
87
|
+
rubygems_version: 1.3.5
|
88
|
+
signing_key:
|
89
|
+
specification_version: 3
|
90
|
+
summary: A framework to write National Data Catalog importers
|
91
|
+
test_files:
|
92
|
+
- spec/datacatalog-importer_spec.rb
|
93
|
+
- spec/spec_helper.rb
|