data_kitten 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ require 'data_kitten/origins/git'
2
+ require 'data_kitten/origins/web_service'
3
+ require 'data_kitten/origins/html'
4
+ require 'data_kitten/origins/linked_data'
5
+
6
module DataKitten

  # Origin detection. Provides the private +detect_origin+ hook, which mixes
  # the first matching origin module into the receiving object.
  module Origins

    private

    # Asks each known origin module, in priority order, whether it supports
    # +@access_url+ and extends the receiver with the first one that does.
    # Nothing is mixed in when no origin matches.
    def detect_origin
      candidates = [
        DataKitten::Origins::Git,
        DataKitten::Origins::HTML,
        DataKitten::Origins::WebService,
        DataKitten::Origins::LinkedData,
      ]
      candidates.each do |candidate|
        next unless candidate.supported?(@access_url)
        extend candidate
        break
      end
    end

  end

end
@@ -0,0 +1,66 @@
1
module DataKitten

  module Origins

    # Git origin module. Automatically mixed into {Dataset} for datasets that are loaded from Git repositories.
    #
    # @see Dataset
    #
    module Git

      # Matches git://, http:// and https:// URLs that end in ".git".
      # Anchored with \z (not \Z) so a trailing newline is not accepted.
      GIT_URL_PATTERN = /\A(git|https?):\/\/.*\.git\z/

      # Whether the given access URL looks like a Git repository URL.
      #
      # @param uri [#to_s] the dataset access URL; +nil+ is treated as no match
      # @return [Boolean]
      def self.supported?(uri)
        !(GIT_URL_PATTERN =~ uri.to_s).nil?
      end

      # The origin type of the dataset.
      # @return [Symbol] +:git+
      # @see Dataset#origin
      def origin
        :git
      end

      # A history of changes to the Dataset, taken from the full git changelog
      #
      # @return [Array] every entry yielded by the repository log, memoised
      # @see Dataset#change_history
      def change_history
        @change_history ||= repository.log.to_a
      end

      protected

      # Reads a file from the working copy.
      #
      # @param path [String] path relative to the root of the working copy
      # @return [String] the file contents
      def load_file(path)
        # Make sure we have an up-to-date working copy before reading
        repository
        File.read(File.join(working_copy_path, path))
      end

      private

      # Directory that holds all working copies (tmp/repositories, three
      # levels above this file).
      def repositories_root
        File.join(File.dirname(__FILE__), '..', '..', '..', 'tmp', 'repositories')
      end

      # Path of the working copy for +@access_url+. Creates the holding
      # directory if needed; the directory name is the URL with every "/"
      # replaced by "-".
      def working_copy_path
        FileUtils.mkdir_p(repositories_root)
        File.join(repositories_root, @access_url.tr('/', '-'))
      end

      # Opens the existing working copy and pulls origin/master, or clones
      # the repository when opening raises ArgumentError (presumably because
      # no working copy exists yet - confirm against the git gem). Memoised.
      def repository
        @repository ||= begin
          repo = ::Git.open(working_copy_path)
          repo.pull("origin", "master")
          repo
        rescue ArgumentError
          ::Git.clone(@access_url, working_copy_path)
        end
      end

    end

  end

end
@@ -0,0 +1,32 @@
1
module DataKitten

  module Origins

    # HTML origin module. Automatically mixed into {Dataset} for datasets whose access URL responds with
    # a +text/html+ content type.
    #
    # @see Dataset
    #
    module HTML

      # Whether a GET request to the URL answers with a +text/html+ content type.
      #
      # Any error while making the request (unreachable host, HTTP error,
      # malformed URL, ...) is treated as "not supported".
      #
      # @param uri [String] the dataset access URL
      # @return [Boolean]
      def self.supported?(uri)
        content_type = RestClient.get(uri).headers[:content_type]
        # to_s so that a missing header simply fails to match
        !(content_type.to_s =~ /text\/html/).nil?
      rescue
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:html+
      # @see Dataset#origin
      def origin
        :html
      end

    end

  end

end
@@ -0,0 +1,37 @@
1
module DataKitten

  module Origins

    # Linked Data origin module. Automatically mixed into {Dataset} for datasets whose access URL responds
    # with a content type registered in +RDF::Format.content_types+.
    #
    # @see Dataset
    #
    module LinkedData

      # Whether a HEAD request to the URL reports an RDF content type.
      #
      # Only the media type is compared: parameters such as +charset+ are
      # dropped, and surrounding whitespace and letter case are ignored.
      # Any error while making the request is treated as "not supported".
      #
      # @param uri [String] the dataset access URL
      # @return [Boolean]
      def self.supported?(uri)
        content_type = RestClient.head(uri).headers[:content_type]
        return false unless content_type

        media_type = content_type.split(";").first.to_s.strip.downcase
        RDF::Format.content_types.key?(media_type)
      rescue
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:linkeddata+
      # @see Dataset#origin
      def origin
        :linkeddata
      end

    end

  end

end
@@ -0,0 +1,30 @@
1
module DataKitten

  module Origins

    # Web service origin module, intended for datasets that are accessed through an API.
    #
    # @see Dataset
    #
    module WebService

      class << self

        # Web service detection is not implemented here: every URI is rejected.
        #
        # @param uri [String] the dataset access URL (ignored)
        # @return [Boolean] always +false+
        def supported?(uri)
          false
        end

      end

      # The origin type of the dataset.
      # @return [Symbol] +:web_service+
      # @see Dataset#origin
      def origin
        :web_service
      end

    end

  end

end
@@ -0,0 +1,28 @@
1
+ require 'data_kitten/publishing_formats/datapackage'
2
+ require 'data_kitten/publishing_formats/rdfa'
3
+ require 'data_kitten/publishing_formats/linked_data'
4
+ require 'data_kitten/publishing_formats/ckan'
5
+
6
module DataKitten

  # Publishing format detection. Provides the private
  # +detect_publishing_format+ hook, which mixes the first matching format
  # module into the receiving object.
  module PublishingFormats

    private

    # Asks each known format module, in priority order, whether it supports
    # the receiver and extends the receiver with the first one that does.
    # Nothing is mixed in when no format matches.
    def detect_publishing_format
      candidates = [
        DataKitten::PublishingFormats::Datapackage,
        DataKitten::PublishingFormats::CKAN,
        DataKitten::PublishingFormats::RDFa,
        DataKitten::PublishingFormats::LinkedData
      ]
      candidates.each do |candidate|
        next unless candidate.supported?(self)
        extend candidate
        break
      end
    end

  end

end
@@ -0,0 +1,187 @@
1
module DataKitten

  module PublishingFormats

    # CKAN metadata format module. {CKAN.supported?} fetches the package
    # metadata from the CKAN API and caches it on the dataset instance; the
    # accessors below read from that per-instance cache.
    module CKAN

      # A package identifier that consists solely of a UUID.
      UUID_PATTERN = /\A[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\z/i

      # Whether the instance's URI points at a CKAN package.
      #
      # The last path segment is taken as the package reference. A UUID is
      # used as the package id directly; anything else is resolved to an id
      # through the CKAN API. The package metadata is then fetched and stored
      # on +instance+ (never in shared module state), so several datasets can
      # be detected without overwriting each other.
      #
      # @param instance [#uri] the dataset being probed
      # @return [Boolean] +false+ on any failure (bad URI, HTTP error, invalid JSON)
      def self.supported?(instance)
        uri = URI(instance.uri)
        package = uri.path.split("/").last
        return false if package.nil? || package.empty?

        base = "#{uri.scheme}://#{uri.host}"
        # If the package is a UUID - it's more than likely to be a CKAN ID
        id = (UUID_PATTERN =~ package) ? package : lookup_id(base, package)
        metadata = JSON.parse RestClient.get("#{base}/api/rest/package/#{id}")
        instance.instance_variable_set(:@ckan_metadata, metadata)
        true
      rescue
        false
      end

      # Resolves a package name to its id: tries the v3 +package_show+ action
      # first and falls back to the v2 REST dataset endpoint when that request
      # fails or returns an empty body.
      #
      # @param base [String] scheme and host of the CKAN site
      # @param package [String] the package name taken from the URI
      # @return [String, nil] the package id, if the response carries one
      def self.lookup_id(base, package)
        response = begin
          RestClient.get "#{base}/api/3/action/package_show", {:params => {:id => package}}
        rescue StandardError
          nil
        end
        if response.nil? || response.to_s.empty?
          response = RestClient.get "#{base}/api/2/rest/dataset/#{package}"
        end

        result = JSON.parse response
        nested = result["result"]
        (nested.is_a?(Hash) && nested["id"]) || result["id"]
      end
      private_class_method :lookup_id

      # The publishing format for the dataset.
      # @return [Symbol] +:ckan+
      # @see Dataset#publishing_format
      def publishing_format
        :ckan
      end

      # The human-readable title of the dataset.
      #
      # @return [String, nil] +nil+ when no metadata is available
      # @see Dataset#data_title
      def data_title
        metadata["title"]
      rescue
        nil
      end

      # A brief description of the dataset
      #
      # @return [String, nil] +nil+ when no metadata is available
      # @see Dataset#description
      def description
        metadata["notes"]
      rescue
        nil
      end

      # Keywords for the dataset
      #
      # @return [Array] a copy of the package tags; empty on failure
      # @see Dataset#keywords
      def keywords
        metadata["tags"].to_a.dup
      rescue
        []
      end

      # A list of publishers.
      #
      # Uses the package's organization id when present, otherwise the first
      # entry of its groups.
      #
      # @return [Array] one Agent, or an empty array on failure
      # @see Dataset#publishers
      def publishers
        organization = metadata['organization']
        id = (organization && organization['id']) || metadata['groups'][0]
        fetch_publisher(id)
      rescue
        []
      end

      # A list of licenses.
      #
      # @return [Array] one License, or an empty array on failure
      # @see Dataset#licenses
      def licenses
        [
          License.new(:id => metadata["license_id"],
                      :uri => metadata["license_url"] || ckan_extra("licence_url"),
                      :name => metadata["license_title"] || ckan_extra("licence_url_title")
                     )
        ]
      rescue
        []
      end

      # A list of distributions, referred to as +resources+ by CKAN.
      #
      # @return [Array, nil] +nil+ on failure (unlike the other list
      #   accessors, which return an empty array; kept for compatibility)
      # @see Dataset#distributions
      def distributions
        metadata["resources"].map do |resource|
          attributes = {
            :title => resource["description"],
            :accessURL => resource["url"],
            :format => resource["format"]
          }
          Distribution.new(self, ckan_resource: attributes)
        end
      rescue
        nil
      end

      # How frequently the data is updated.
      #
      # @return [Object, nil] the +update_frequency+ extra, falling back to +frequency-of-update+
      # @see Dataset#update_frequency
      def update_frequency
        ckan_extra("update_frequency") || ckan_extra("frequency-of-update")
      end

      # Date the dataset was released
      #
      # @return [Date, nil]
      # @see Dataset#issued
      def issued
        ckan_date(metadata["metadata_created"])
      rescue
        nil
      end

      # Date the dataset was modified
      #
      # @return [Date, nil]
      # @see Dataset#modified
      def modified
        ckan_date(metadata["metadata_modified"])
      rescue
        nil
      end

      # The temporal coverage of the dataset
      #
      # @return [Temporal] start and/or end are +nil+ when missing or unparseable
      # @see Dataset#temporal
      def temporal
        Temporal.new(:start => ckan_date(ckan_extra("temporal_coverage-from")),
                     :end => ckan_date(ckan_extra("temporal_coverage-to")))
      end

      private

      # The package metadata cached on this instance by {CKAN.supported?};
      # +nil+ if detection has not run for this instance.
      def metadata
        @ckan_metadata
      end

      # Looks up +key+ in the package's +extras+ hash; +nil+ when metadata or
      # extras are missing or extras is not a hash.
      def ckan_extra(key)
        extras = metadata["extras"]
        extras.is_a?(Hash) ? extras[key] : nil
      rescue
        nil
      end

      # Parses a date string, returning +nil+ for missing or invalid values.
      def ckan_date(value)
        Date.parse(value)
      rescue
        nil
      end

      # Reads an extra from a group response. Handles both shapes the code
      # has to cope with: a top-level +extras+ hash, or a list of
      # key/value entries under +result+. Returns "" when the extra cannot
      # be found.
      def select_extras(group, key)
        extras = group["extras"]
        extra = extras.is_a?(Hash) ? extras[key] : ""
        return extra unless extra == ""

        group['result']['extras'].select {|e| e["key"] == key }.first['value']
      rescue
        ""
      end

      # Fetches the group/organization with the given id, trying each known
      # endpoint until one does not answer 404, and wraps it in an Agent.
      #
      # @return [Array] one Agent, or an empty array when every endpoint 404s
      def fetch_publisher(id)
        site = parsed_uri
        base = "#{site.scheme}://#{site.host}"
        endpoints = [
          "#{base}/api/rest/group/#{id}",
          "#{base}/api/3/action/group_show?id=#{id}",
          "#{base}/api/3/action/organization_show?id=#{id}"
        ]

        group = nil
        endpoints.each do |endpoint|
          begin
            group = JSON.parse RestClient.get endpoint
            break
          rescue RestClient::ResourceNotFound
            next
          end
        end
        return [] unless group

        [
          Agent.new(
            :name => group["display_name"] || group["result"]["name"],
            :homepage => select_extras(group, "website-url"),
            :mbox => select_extras(group, "contact-email")
          )
        ]
      end

      # The dataset URI, parsed.
      def parsed_uri
        URI(self.uri)
      end

    end
  end
end
@@ -0,0 +1,169 @@
1
module DataKitten

  module PublishingFormats

    # Datapackage metadata format module. Automatically mixed into {Dataset} for datasets that include a +datapackage.json+.
    #
    # @see Dataset
    #
    module Datapackage

      # Whether a datapackage can be loaded for the instance and reports a
      # +datapackage_version+. For +:git+ origins the package is built from
      # the repository's +datapackage.json+; otherwise from the instance URI.
      # Any error while loading counts as "not supported".
      #
      # @param instance [Dataset] the dataset being probed
      # @return [Boolean]
      def self.supported?(instance)
        datapackage =
          if instance.send(:origin) == :git
            raw = instance.send(:load_file, "datapackage.json")
            DataPackage::Package.new( JSON.parse( raw ) )
          else
            DataPackage::Package.new( instance.uri )
          end
        !datapackage.datapackage_version.nil?
      rescue
        false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:datapackage+
      # @see Dataset#publishing_format
      def publishing_format
        :datapackage
      end

      # A list of maintainers.
      #
      # @see Dataset#maintainers
      def maintainers
        package.maintainers.map { |entry| datapackage_agent(entry) }
      end

      # A list of publishers.
      #
      # @see Dataset#publishers
      def publishers
        package.publisher.map { |entry| datapackage_agent(entry) }
      end

      # A list of licenses.
      #
      # @see Dataset#licenses
      def licenses
        package.licenses.map do |entry|
          License.new(:id => entry['id'], :uri => entry['url'], :name => entry['name'])
        end
      end

      # The rights statement of the package, built from its +rights+
      # property with the keys converted to symbols.
      #
      # @return [Rights, nil] +nil+ when the package has no +rights+ property
      def rights
        return nil unless package.property("rights")

        attributes = package.property("rights", []).each_with_object({}) do |(key, value), memo|
          memo[key.to_sym] = value
        end
        Rights.new( attributes )
      end

      # A list of contributors.
      #
      # @see Dataset#contributors
      def contributors
        package.contributors.map { |entry| datapackage_agent(entry) }
      end

      # A list of distributions, referred to as +resources+ by Datapackage.
      #
      # @see Dataset#distributions
      def distributions
        package.resources.map do |resource|
          Distribution.new(self, datapackage_resource: resource)
        end
      end

      # The human-readable title of the dataset; falls back to the package name.
      #
      # @see Dataset#data_title
      def data_title
        package.title || package.name
      end

      # A brief description of the dataset
      #
      # @see Dataset#description
      def description
        package.description
      end

      # Keywords for the dataset
      #
      # @see Dataset#keywords
      def keywords
        package.keywords
      end

      # Where the data is sourced from
      #
      # @see Dataset#sources
      def sources
        package.sources.map do |entry|
          Source.new(:label => entry['name'], :resource => entry['web'])
        end
      end

      # Date the dataset was modified
      def modified
        package.last_modified
      end

      # A history of changes to the Dataset.
      #
      # If the dataset's origin is +:git+, this is the git changelog for the actual distribution files, rather
      # than the full unfiltered log. For any other origin it is empty.
      #
      # @return [Array] An array of changes, newest first. Exact format depends on the source.
      #
      # @see Dataset#change_history
      def change_history
        @change_history ||=
          if origin == :git
            # One log per distribution file that has a path in the local repo
            per_file = distributions.map do |file|
              file.path ? repository.log.path(file.path).to_a : []
            end
            # combine all logs, make unique, and re-sort in date order
            per_file.flatten.uniq.sort_by { |entry| entry.committer.date }.reverse
          else
            []
          end
      end

      private

      # Builds an Agent from a datapackage person/organisation entry.
      def datapackage_agent(entry)
        Agent.new(:name => entry['name'], :uri => entry['web'], :email => entry['email'])
      end

      # The parsed datapackage, loaded on first use: from the repository's
      # +datapackage.json+ for +:git+ origins, otherwise from +access_url+.
      def package
        @datapackage ||=
          if origin == :git
            DataPackage::Package.new( JSON.parse( load_file("datapackage.json") ) )
          else
            DataPackage::Package.new( access_url )
          end
      end
    end

  end

end