datacatalog-importer 0.1.10 → 0.1.11

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.10
1
+ 0.1.11
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.1.10"
8
+ s.version = "0.1.11"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-03-17}
12
+ s.date = %q{2010-04-05}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
data/lib/utility.rb CHANGED
@@ -6,9 +6,13 @@ module DataCatalog
6
6
 
7
7
  class Utility
8
8
 
9
+ # == URLs ==
10
+
9
11
  def self.absolute_url(page_url, url)
10
12
  Utility.plain_string(URI.parse(page_url).merge(url).to_s)
11
13
  end
14
+
15
+ # == Cleaning ==
12
16
 
13
17
  def self.single_line_clean(s)
14
18
  plain_string(
@@ -20,10 +24,20 @@ module DataCatalog
20
24
  s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
21
25
  end
22
26
 
23
- def self.fetch(uri)
24
- puts "Fetching #{uri}..."
25
- io = open(uri, headers)
26
- io.read
27
+ # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
28
+ # This method removes it so you can output clean YAML.
29
+ def self.plain_string(s)
30
+ if s.instance_variable_defined?(:@_rails_html_safe)
31
+ s.send(:remove_instance_variable, :@_rails_html_safe)
32
+ end
33
+ s
34
+ end
35
+
36
+ # == API ===
37
+
38
+ def self.setup_api(api_key, base_uri)
39
+ DataCatalog.api_key = api_key
40
+ DataCatalog.base_uri = base_uri
27
41
  end
28
42
 
29
43
  def self.headers
@@ -31,7 +45,27 @@ module DataCatalog
31
45
  "UserAgent" => "National Data Catalog Importer/0.1.6",
32
46
  }
33
47
  end
48
+
49
+ # == Various ==
34
50
 
51
+ def self.fetch(uri)
52
+ puts "Fetching #{uri}..."
53
+ io = open(uri, headers)
54
+ io.read
55
+ end
56
+
57
+ def self.report_timing(label)
58
+ puts "Starting: [#{label}]"
59
+ t0 = Time.now
60
+ result = yield
61
+ t1 = Time.now
62
+ diff = t1 - t0
63
+ puts "Elapsed time [#{label}] %.2f s" % diff
64
+ result
65
+ end
66
+
67
+ # == CSV ==
68
+
35
69
  # { :headers => true } is a common option
36
70
  def self.parse_csv_from_file(filename, options={})
37
71
  extra_header_rows = options.delete(:extra_header_rows) || 0
@@ -41,12 +75,13 @@ module DataCatalog
41
75
  end
42
76
  end
43
77
 
44
- def self.parse_html_from_file(filename)
45
- File.open(filename) do |f|
46
- Nokogiri::HTML::Document.parse(f)
47
- end
78
+ def self.parse_csv_from_uri(uri, options={})
79
+ puts "Fetching #{uri}..."
80
+ data = open(uri, headers)
81
+ puts data.inspect
82
+ FasterCSV.parse(data, options)
48
83
  end
49
-
84
+
50
85
  def self.parse_csv_from_file_or_uri(uri, file, options={})
51
86
  force_fetch = options.delete(:force_fetch) || false
52
87
  if force_fetch || !File.exist?(file)
@@ -56,19 +91,12 @@ module DataCatalog
56
91
  parse_csv_from_file(file, options)
57
92
  end
58
93
 
59
- def self.parse_html_from_file_or_uri(uri, file, options={})
60
- if options[:force_fetch] || !File.exist?(file)
61
- document = parse_html_from_uri(uri)
62
- File.open(file, "w") { |f| f.write(document) }
94
+ # == HTML ==
95
+
96
+ def self.parse_html_from_file(filename)
97
+ File.open(filename) do |f|
98
+ Nokogiri::HTML::Document.parse(f)
63
99
  end
64
- parse_html_from_file(file) # Why always parse the file? See Note 001, below.
65
- end
66
-
67
- def self.parse_csv_from_uri(uri, options={})
68
- puts "Fetching #{uri}..."
69
- data = open(uri, headers)
70
- puts data.inspect
71
- FasterCSV.parse(data, options)
72
100
  end
73
101
 
74
102
  def self.parse_html_from_uri(uri)
@@ -78,30 +106,39 @@ module DataCatalog
78
106
  end
79
107
  end
80
108
 
81
- # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
82
- # This method removes it so you can output clean YAML.
83
- def self.plain_string(s)
84
- if s.instance_variable_defined?(:@_rails_html_safe)
85
- s.send(:remove_instance_variable, :@_rails_html_safe)
109
+ def self.parse_html_from_file_or_uri(uri, file, options={})
110
+ if options[:force_fetch] || !File.exist?(file)
111
+ document = parse_html_from_uri(uri)
112
+ File.open(file, "w") { |f| f.write(document) }
86
113
  end
87
- s
114
+ parse_html_from_file(file) # Why always parse the file? See Note 001, below.
88
115
  end
116
+
117
+ # == XML
89
118
 
90
- def self.report_timing(label)
91
- puts "Starting: [#{label}]"
92
- t0 = Time.now
93
- result = yield
94
- t1 = Time.now
95
- diff = t1 - t0
96
- puts "Elapsed time [#{label}] %.2f s" % diff
97
- result
119
+ def self.parse_xml_from_file(filename)
120
+ File.open(filename) do |f|
121
+ Nokogiri::XML::Document.parse(f)
122
+ end
98
123
  end
99
124
 
100
- def self.setup_api(api_key, base_uri)
101
- DataCatalog.api_key = api_key
102
- DataCatalog.base_uri = base_uri
125
+ def self.parse_xml_from_uri(uri)
126
+ puts "Fetching #{uri}..."
127
+ Nokogiri::XML(open(uri))
103
128
  end
104
-
129
+
130
+ def self.parse_xml_from_file_or_uri(uri, file, options={})
131
+ if options[:force_fetch] || !File.exist?(file)
132
+ document = parse_xml_from_uri(uri)
133
+ File.open(file, "w") { |f| f.write(document) }
134
+ end
135
+ parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
136
+ end
137
+
138
+ # == YAML
139
+
140
+ # To load YAML use: YAML::load_file(filename)
141
+
105
142
  def self.write_yaml(filename, contents)
106
143
  File.open(filename, "w") do |f|
107
144
  YAML::dump(contents, f)
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 10
9
- version: 0.1.10
8
+ - 11
9
+ version: 0.1.11
10
10
  platform: ruby
11
11
  authors:
12
12
  - David James
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-03-17 00:00:00 -04:00
17
+ date: 2010-04-05 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency