data_kitten 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,28 @@
1
+ require 'data_kitten/origins/git'
2
+ require 'data_kitten/origins/web_service'
3
+ require 'data_kitten/origins/html'
4
+ require 'data_kitten/origins/linked_data'
5
+
6
module DataKitten
  # Mixin that works out where a dataset is hosted (its "origin") and extends
  # the receiver with the matching origin module.
  module Origins
    private

    # Tries each known origin module, in priority order, against
    # +@access_url+. The receiver is extended with the first module whose
    # +supported?+ check is truthy; the remaining candidates are not consulted.
    #
    # @return [nil, Array<Module>] +nil+ once an origin has been mixed in,
    #   otherwise the list of candidates that were tried
    def detect_origin
      candidates = [
        DataKitten::Origins::Git,
        DataKitten::Origins::HTML,
        DataKitten::Origins::WebService,
        DataKitten::Origins::LinkedData
      ]

      candidates.each do |candidate|
        next unless candidate.supported?(@access_url)

        extend candidate
        break
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
require 'fileutils'

module DataKitten
  module Origins
    # Git origin module. Automatically mixed into {Dataset} for datasets that are loaded from Git repositories.
    #
    # @see Dataset
    #
    module Git
      # Access URLs using the git, http or https scheme that end in ".git".
      # Anchored with \A and \z so a trailing newline is not accepted.
      REPOSITORY_URL = %r{\A(git|https?)://.*\.git\z}

      # Can the given access URL be handled as a Git repository?
      #
      # @param uri [#to_s] the dataset's access URL (+nil+ is treated as unsupported)
      # @return [Boolean]
      def self.supported?(uri)
        !(uri.to_s =~ REPOSITORY_URL).nil?
      end

      # The origin type of the dataset.
      # @return [Symbol] +:git+
      # @see Dataset#origin
      def origin
        :git
      end

      # A history of changes to the Dataset, taken from the full git changelog
      #
      # @return [Array] every entry of the repository log; memoised after the first call
      # @see Dataset#change_history
      def change_history
        @change_history ||= repository.log.to_a
      end

      protected

      # Reads a file from the local working copy of the repository.
      #
      # @param path [String] path relative to the repository root
      # @return [String] the file contents
      def load_file(path)
        # Make sure we have a working copy
        repository
        # read file
        File.read(File.join(working_copy_path, path))
      end

      private

      # Location of this dataset's working copy under the gem's
      # tmp/repositories directory. The holding directory is created on demand.
      #
      # @return [String]
      def working_copy_path
        holding_dir = File.join(File.dirname(__FILE__), '..', '..', '..', 'tmp', 'repositories')
        # Create holding directory
        FileUtils.mkdir_p(holding_dir)
        # generate working copy dir: one directory per access URL, with slashes flattened
        File.join(holding_dir, @access_url.gsub('/', '-'))
      end

      # The repository handle for the working copy, memoised.
      #
      # An existing working copy is opened and updated from origin/master
      # (the branch name is hard-coded). An ArgumentError while doing so is
      # treated as "no working copy yet" and triggers a fresh clone.
      def repository
        @repository ||= begin
          repo = ::Git.open(working_copy_path)
          repo.pull("origin", "master")
          repo
        rescue ArgumentError
          ::Git.clone(@access_url, working_copy_path)
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module DataKitten
  module Origins
    # HTML origin module. Automatically mixed into {Dataset} for datasets whose access URL is served as an HTML page.
    #
    # @see Dataset
    #
    module HTML
      # Does a GET of the given URL come back with an HTML content type?
      #
      # The check is case-insensitive and ignores any parameters such as
      # +charset+. Any error while fetching (bad URL, network failure, HTTP
      # error raised by the client) is reported as "not supported".
      #
      # @param uri [String] the dataset's access URL
      # @return [Boolean]
      def self.supported?(uri)
        content_type = RestClient.get(uri).headers[:content_type]
        content_type.to_s.downcase.include?('text/html')
      rescue StandardError
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:html+
      # @see Dataset#origin
      def origin
        :html
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module DataKitten
  module Origins
    # Linked Data origin module. Automatically mixed into {Dataset} for datasets whose access URL
    # responds with a content type that is registered as an RDF format.
    #
    # @see Dataset
    #
    module LinkedData
      # Does a HEAD request for the given URL report an RDF content type?
      #
      # Only the media type is compared: parameters after ";" are dropped,
      # surrounding whitespace is stripped and the comparison is
      # case-insensitive. A missing or empty header, or any error while making
      # the request, is reported as "not supported".
      #
      # @param uri [String] the dataset's access URL
      # @return [Boolean]
      def self.supported?(uri)
        content_type = RestClient.head(uri).headers[:content_type]
        media_type = content_type.to_s.split(';').first.to_s.strip.downcase
        return false if media_type.empty?

        RDF::Format.content_types.key?(media_type)
      rescue StandardError
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:linkeddata+
      # @see Dataset#origin
      def origin
        :linkeddata
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module DataKitten
  module Origins
    # Web service origin module. Mixed into {Dataset} for datasets that are accessed through an API.
    #
    # @see Dataset
    #
    module WebService
      class << self
        # Detection hook used when working out a dataset's origin.
        # As written, no URI is ever reported as a web service.
        #
        # @param uri [String] the dataset's access URL (ignored)
        # @return [false]
        def supported?(uri)
          false
        end
      end

      # The origin type of the dataset.
      # @return [Symbol] +:web_service+
      # @see Dataset#origin
      def origin
        :web_service
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'data_kitten/publishing_formats/datapackage'
2
+ require 'data_kitten/publishing_formats/rdfa'
3
+ require 'data_kitten/publishing_formats/linked_data'
4
+ require 'data_kitten/publishing_formats/ckan'
5
+
6
module DataKitten
  # Mixin that works out which metadata format a dataset is published in and
  # extends the receiver with the matching format module.
  module PublishingFormats
    private

    # Offers the receiver to each known publishing-format module, in priority
    # order. The receiver is extended with the first module whose +supported?+
    # check is truthy; the remaining candidates are not consulted.
    #
    # @return [nil, Array<Module>] +nil+ once a format has been mixed in,
    #   otherwise the list of candidates that were tried
    def detect_publishing_format
      candidates = [
        DataKitten::PublishingFormats::Datapackage,
        DataKitten::PublishingFormats::CKAN,
        DataKitten::PublishingFormats::RDFa,
        DataKitten::PublishingFormats::LinkedData
      ]

      candidates.each do |candidate|
        next unless candidate.supported?(self)

        extend candidate
        break
      end
    end
  end
end
@@ -0,0 +1,187 @@
1
module DataKitten
  module PublishingFormats
    # CKAN metadata format module. Mixed into a dataset whose URI resolves to a
    # CKAN package, and answers the dataset's metadata questions from the
    # package description fetched from that CKAN instance.
    #
    # The package metadata is stored on the dataset itself (in
    # +@ckan_metadata+), so several CKAN datasets can be loaded side by side
    # without overwriting one another.
    module CKAN
      # A lower-case hexadecimal UUID, which is what CKAN package ids look like.
      UUID_PATTERN = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/

      # Is the given dataset published through CKAN?
      #
      # The last path segment of the dataset's URI is taken as the package
      # name. If it already looks like a UUID it is used as the package id;
      # otherwise the id is looked up through the CKAN API. The package
      # metadata is then fetched for that id and stored on +instance+.
      #
      # @param instance [#uri] the dataset being probed
      # @return [Boolean] +false+ when any step fails (bad URI, HTTP error, invalid JSON)
      def self.supported?(instance)
        uri = URI(instance.uri)
        # NOTE: only scheme and host are used, so a non-default port is dropped.
        base = "#{uri.scheme}://#{uri.host}"
        package = uri.path.split("/").last
        # If the package is a UUID - it's more than likely to be a CKAN ID
        id = package.match(UUID_PATTERN) ? package : resolve_package_id(base, package)

        metadata = JSON.parse(RestClient.get("#{base}/api/rest/package/#{id}"))
        instance.instance_variable_set(:@ckan_metadata, metadata)
        metadata ? true : false
      rescue StandardError
        false
      end

      # Looks up the id of a named package, trying the version 3 action API
      # first and the version 2 REST API when that fails or returns nothing.
      #
      # @param base [String] scheme and host of the CKAN instance
      # @param package [String] the package name taken from the dataset URI
      # @return [String, nil] the package id
      def self.resolve_package_id(base, package)
        response = begin
          RestClient.get("#{base}/api/3/action/package_show", { :params => { :id => package } })
        rescue StandardError
          ""
        end
        response = RestClient.get("#{base}/api/2/rest/dataset/#{package}") if response == ""

        result = JSON.parse(response)
        # v3 wraps the package in "result"; v2 returns it at the top level.
        (result["result"] || result)["id"]
      end
      private_class_method :resolve_package_id

      # The publishing format for the dataset.
      # @return [Symbol] +:ckan+
      # @see Dataset#publishing_format
      def publishing_format
        :ckan
      end

      # The human-readable title of the dataset.
      #
      # @return [String, nil]
      # @see Dataset#data_title
      def data_title
        metadata_value("title")
      end

      # A brief description of the dataset
      #
      # @return [String, nil]
      # @see Dataset#description
      def description
        metadata_value("notes")
      end

      # Keywords for the dataset
      #
      # @return [Array] the package's tags, or an empty array when there are none
      # @see Dataset#keywords
      def keywords
        metadata["tags"].map { |tag| tag }
      rescue StandardError
        []
      end

      # A list of publishers.
      #
      # The publisher is the package's organization, or failing that its
      # first group.
      #
      # @return [Array<Agent>] empty when no publisher can be determined
      # @see Dataset#publishers
      def publishers
        id = metadata_value("organization", "id") || metadata_value("groups", 0)
        id ? fetch_publisher(id) : []
      rescue StandardError
        []
      end

      # A list of licenses.
      #
      # @return [Array<License>] empty when the metadata is unavailable
      # @see Dataset#licenses
      def licenses
        uri = metadata_value("license_url") || metadata_value("extras", "licence_url")
        name = metadata_value("license_title") || metadata_value("extras", "licence_url_title")
        [
          License.new(:id => metadata["license_id"],
                      :uri => uri,
                      :name => name)
        ]
      rescue StandardError
        []
      end

      # A list of distributions, referred to as +resources+ by CKAN.
      #
      # @return [Array<Distribution>, nil] +nil+ when the package lists no resources
      # @see Dataset#distributions
      def distributions
        metadata["resources"].map do |resource|
          details = {
            :title => resource["description"],
            :accessURL => resource["url"],
            :format => resource["format"]
          }
          Distribution.new(self, ckan_resource: details)
        end
      rescue StandardError
        nil
      end

      # How frequently the data is updated.
      #
      # @return [String, nil]
      # @see Dataset#update_frequency
      def update_frequency
        metadata_value("extras", "update_frequency") || metadata_value("extras", "frequency-of-update")
      end

      # Date the dataset was released
      #
      # @return [Date, nil]
      # @see Dataset#issued
      def issued
        parse_metadata_date(metadata_value("metadata_created"))
      end

      # Date the dataset was modified
      #
      # @return [Date, nil]
      # @see Dataset#modified
      def modified
        parse_metadata_date(metadata_value("metadata_modified"))
      end

      # The temporal coverage of the dataset
      #
      # @return [Temporal] either bound is +nil+ when missing or unparseable
      # @see Dataset#temporal
      def temporal
        start_date = parse_metadata_date(metadata_value("extras", "temporal_coverage-from"))
        end_date = parse_metadata_date(metadata_value("extras", "temporal_coverage-to"))
        Temporal.new(:start => start_date, :end => end_date)
      end

      private

      # The package description stored on this dataset by {CKAN.supported?}.
      #
      # @return [Hash, nil]
      def metadata
        @ckan_metadata
      end

      # Walks +path+ through the metadata.
      #
      # @param path [Array<String, Integer>] successive keys or indexes
      # @return [Object, nil] +nil+ as soon as any step is missing
      def metadata_value(*path)
        path.inject(metadata) { |node, key| node[key] }
      rescue StandardError
        nil
      end

      # @param value [String, nil] a date string from the metadata
      # @return [Date, nil] +nil+ when the value is missing or cannot be parsed
      def parse_metadata_date(value)
        Date.parse(value)
      rescue StandardError
        nil
      end

      # Reads an "extras" entry from a group description. A plain
      # "extras" hash is tried first; if that lookup fails or yields an empty
      # string, the list of key/value pairs under "result" is searched.
      #
      # @return [Object] the value, or "" when it cannot be found
      def select_extras(group, key)
        extra = begin
          group["extras"][key]
        rescue StandardError
          ""
        end
        return extra unless extra == ""

        group['result']['extras'].select { |e| e["key"] == key }.first['value']
      rescue StandardError
        ""
      end

      # Fetches the description of a group or organization and wraps it in an
      # {Agent}. Three API endpoints are tried in turn; a 404 moves on to the
      # next one, any other error is left to the caller.
      #
      # @param id [String] group or organization id
      # @return [Array<Agent>] empty when every endpoint answered 404
      def fetch_publisher(id)
        location = parsed_uri
        base = "#{location.scheme}://#{location.host}"
        endpoints = [
          "#{base}/api/rest/group/#{id}",
          "#{base}/api/3/action/group_show?id=#{id}",
          "#{base}/api/3/action/organization_show?id=#{id}"
        ]

        group = nil
        endpoints.each do |endpoint|
          begin
            group = JSON.parse(RestClient.get(endpoint))
            break
          rescue RestClient::ResourceNotFound
            next
          end
        end
        return [] unless group

        [
          Agent.new(
            :name => group["display_name"] || group["result"]["name"],
            :homepage => select_extras(group, "website-url"),
            :mbox => select_extras(group, "contact-email")
          )
        ]
      end

      # @return [URI] the dataset's URI, parsed
      def parsed_uri
        URI(self.uri)
      end
    end
  end
end
@@ -0,0 +1,169 @@
1
module DataKitten
  module PublishingFormats
    # Datapackage metadata format module. Automatically mixed into {Dataset} for datasets that include a +datapackage.json+.
    #
    # @see Dataset
    #
    module Datapackage
      # Does the dataset carry a usable datapackage description?
      #
      # For git-hosted datasets the +datapackage.json+ in the repository is
      # parsed; otherwise the package is built from the dataset's URI. The
      # dataset is supported when the resulting package reports a
      # datapackage version. Any error along the way counts as "no".
      #
      # @param instance [Dataset] the dataset being probed
      # @return [Boolean]
      def self.supported?(instance)
        source =
          if instance.send(:origin) == :git
            JSON.parse(instance.send(:load_file, "datapackage.json"))
          else
            instance.uri
          end
        !DataPackage::Package.new(source).datapackage_version.nil?
      rescue StandardError
        false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:datapackage+
      # @see Dataset#publishing_format
      def publishing_format
        :datapackage
      end

      # A list of maintainers.
      #
      # @see Dataset#maintainers
      def maintainers
        build_agents(package.maintainers)
      end

      # A list of publishers.
      #
      # @see Dataset#publishers
      def publishers
        build_agents(package.publisher)
      end

      # A list of licenses.
      #
      # @see Dataset#licenses
      def licenses
        package.licenses.map do |entry|
          License.new(:id => entry['id'], :uri => entry['url'], :name => entry['name'])
        end
      end

      # The rights statement for the dataset, built from the package's
      # "rights" property with its keys converted to symbols.
      #
      # @return [Rights, nil] +nil+ when the package has no "rights" property
      def rights
        return nil unless package.property("rights")

        attributes = package.property("rights", []).each_with_object({}) do |(key, value), acc|
          acc[key.to_sym] = value
        end
        Rights.new(attributes)
      end

      # A list of contributors.
      #
      # @see Dataset#contributors
      def contributors
        build_agents(package.contributors)
      end

      # A list of distributions, referred to as +resources+ by Datapackage.
      #
      # @see Dataset#distributions
      def distributions
        package.resources.map do |resource|
          Distribution.new(self, datapackage_resource: resource)
        end
      end

      # The human-readable title of the dataset. Falls back to the package
      # name when no title is given.
      #
      # @see Dataset#data_title
      def data_title
        package.title || package.name
      end

      # A brief description of the dataset
      #
      # @see Dataset#description
      def description
        package.description
      end

      # Keywords for the dataset
      #
      # @see Dataset#keywords
      def keywords
        package.keywords
      end

      # Where the data is sourced from
      #
      # @see Dataset#sources
      def sources
        package.sources.map do |entry|
          Source.new(:label => entry['name'], :resource => entry['web'])
        end
      end

      # Date the dataset was modified
      def modified
        package.last_modified
      end

      # A history of changes to the Dataset.
      #
      # When the dataset's origin is +:git+, this is the git changelog for the actual distribution files,
      # rather than the full unfiltered log. For any other origin it is empty.
      #
      # @return [Array] An array of changes, newest first. Exact format depends on the origin.
      #
      # @see Dataset#change_history
      def change_history
        @change_history ||=
          if origin == :git
            # One log per distribution file that has a path in the local repo
            per_file = distributions.map do |file|
              file.path ? repository.log.path(file.path).map { |commit| commit } : []
            end
            # Merge the logs, drop duplicates, and order by commit date, newest first
            per_file.flatten.uniq.sort_by { |commit| commit.committer.date }.reverse
          else
            []
          end
      end

      private

      # Turns a list of datapackage person/organisation entries into agents.
      #
      # @param entries [Array<Hash>] entries with "name", "web" and "email" keys
      # @return [Array<Agent>]
      def build_agents(entries)
        entries.map do |entry|
          Agent.new(:name => entry['name'], :uri => entry['web'], :email => entry['email'])
        end
      end

      # The parsed datapackage, loaded on first use: from +datapackage.json+
      # in the repository for git origins, otherwise from the access URL.
      def package
        @datapackage ||=
          if origin == :git
            DataPackage::Package.new(JSON.parse(load_file("datapackage.json")))
          else
            DataPackage::Package.new(access_url)
          end
      end
    end
  end
end