datacatalog-importer 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.2
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.1.1"
8
+ s.version = "0.1.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-02-19}
12
+ s.date = %q{2010-03-02}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
data/lib/puller.rb CHANGED
@@ -45,7 +45,7 @@ module DataCatalog
45
45
  #
46
46
  def pull_resource(resource)
47
47
  unless importer_class = @options[:pullers][resource]
48
- raise Error, "options[:pullers][:#{r}] is required"
48
+ raise Error, "options[:pullers][:#{resource}] is required"
49
49
  end
50
50
  importer = importer_class.new
51
51
  FileUtils.mkdir_p(folder(resource))
data/lib/utility.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'fastercsv'
1
2
  require 'nokogiri'
2
3
  require 'open-uri'
3
4
 
@@ -30,28 +31,51 @@ module DataCatalog
30
31
  "UserAgent" => "National Data Catalog Importer/0.1.0",
31
32
  }
32
33
  end
34
+
35
+ def self.parse_csv_from_file(filename, extra_header_rows=0)
36
+ File.open(filename) do |f|
37
+ extra_header_rows.times { f.gets } # ignore these rows
38
+ FasterCSV.parse(f)
39
+ end
40
+ end
33
41
 
34
- def self.parse_file(filename)
42
+ def self.parse_html_from_file(filename)
35
43
  File.open(filename) do |f|
36
44
  Nokogiri::HTML::Document.parse(f)
37
45
  end
38
46
  end
39
47
 
40
- def self.parse_file_or_uri(uri, file, options={})
48
+ def self.parse_csv_from_file_or_uri(uri, file, options={})
41
49
  if options[:force_fetch] || !File.exist?(file)
42
- document = parse_uri(uri)
50
+ document = parse_csv_from_uri(uri)
43
51
  File.open(file, "w") { |f| f.write(document) }
44
52
  end
45
- parse_file(file) # Why always parse the file? See Note 001, below.
53
+ parse_csv_from_file(file, options[:extra_header_rows] || 0)
54
+ # Why always parse the file? Consistency with parse_html_from_file_or_uri.
46
55
  end
47
-
48
- def self.parse_uri(uri)
56
+
57
+ def self.parse_html_from_file_or_uri(uri, file, options={})
58
+ if options[:force_fetch] || !File.exist?(file)
59
+ document = parse_html_from_uri(uri)
60
+ File.open(file, "w") { |f| f.write(document) }
61
+ end
62
+ parse_html_from_file(file) # Why always parse the file? See Note 001, below.
63
+ end
64
+
65
+ def self.parse_csv_from_uri(uri)
49
66
  puts "Fetching #{uri}..."
50
67
  open(uri, headers) do |io|
51
68
  Nokogiri::HTML::Document.parse(io)
52
69
  end
53
70
  end
54
71
 
72
+ def self.parse_html_from_uri(uri)
73
+ puts "Fetching #{uri}..."
74
+ open(uri, headers) do |io|
75
+ FasterCSV.parse(io)
76
+ end
77
+ end
78
+
55
79
  # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
56
80
  # This method removes it so you can output clean YAML.
57
81
  def self.plain_string(s)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datacatalog-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - David James
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-19 00:00:00 -05:00
12
+ date: 2010-03-02 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency