datacatalog-importer 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.2
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.1.1"
8
+ s.version = "0.1.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-02-19}
12
+ s.date = %q{2010-03-02}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
data/lib/puller.rb CHANGED
@@ -45,7 +45,7 @@ module DataCatalog
45
45
  #
46
46
  def pull_resource(resource)
47
47
  unless importer_class = @options[:pullers][resource]
48
- raise Error, "options[:pullers][:#{r}] is required"
48
+ raise Error, "options[:pullers][:#{resource}] is required"
49
49
  end
50
50
  importer = importer_class.new
51
51
  FileUtils.mkdir_p(folder(resource))
data/lib/utility.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'fastercsv'
1
2
  require 'nokogiri'
2
3
  require 'open-uri'
3
4
 
@@ -30,28 +31,51 @@ module DataCatalog
30
31
  "UserAgent" => "National Data Catalog Importer/0.1.0",
31
32
  }
32
33
  end
34
+
35
+ def self.parse_csv_from_file(filename, extra_header_rows=0)
36
+ File.open(filename) do |f|
37
+ extra_header_rows.times { f.gets } # ignore these rows
38
+ FasterCSV.parse(f)
39
+ end
40
+ end
33
41
 
34
- def self.parse_file(filename)
42
+ def self.parse_html_from_file(filename)
35
43
  File.open(filename) do |f|
36
44
  Nokogiri::HTML::Document.parse(f)
37
45
  end
38
46
  end
39
47
 
40
- def self.parse_file_or_uri(uri, file, options={})
48
+ def self.parse_csv_from_file_or_uri(uri, file, options={})
41
49
  if options[:force_fetch] || !File.exist?(file)
42
- document = parse_uri(uri)
50
+ document = open(uri, headers) { |io| io.read } # cache the raw CSV text; a parsed table written with f.write is no longer valid CSV
43
51
  File.open(file, "w") { |f| f.write(document) }
44
52
  end
45
- parse_file(file) # Why always parse the file? See Note 001, below.
53
+ parse_csv_from_file(file, options[:extra_header_rows] || 0)
54
+ # Why always parse the file? Consistency with parse_html_from_file_or_uri.
46
55
  end
47
-
48
- def self.parse_uri(uri)
56
+
57
+ def self.parse_html_from_file_or_uri(uri, file, options={})
58
+ if options[:force_fetch] || !File.exist?(file)
59
+ document = parse_html_from_uri(uri)
60
+ File.open(file, "w") { |f| f.write(document) }
61
+ end
62
+ parse_html_from_file(file) # Why always parse the file? See Note 001, below.
63
+ end
64
+
65
+ def self.parse_csv_from_uri(uri)
49
66
  puts "Fetching #{uri}..."
50
67
  open(uri, headers) do |io|
51
- Nokogiri::HTML::Document.parse(io)
68
+ FasterCSV.parse(io) # CSV responses must go through FasterCSV, not the HTML parser
52
69
  end
53
70
  end
54
71
 
72
+ def self.parse_html_from_uri(uri)
73
+ puts "Fetching #{uri}..."
74
+ open(uri, headers) do |io|
75
+ Nokogiri::HTML::Document.parse(io) # HTML responses are parsed with Nokogiri, matching parse_html_from_file
76
+ end
77
+ end
78
+
55
79
  # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
56
80
  # This method removes it so you can output clean YAML.
57
81
  def self.plain_string(s)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datacatalog-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - David James
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-19 00:00:00 -05:00
12
+ date: 2010-03-02 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency