data_kitten 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,156 @@
1
module DataKitten

  # A specific available form of a dataset, such as a CSV file, an API, or an RSS feed.
  #
  # Based on {http://www.w3.org/TR/vocab-dcat/#class-distribution dcat:Distribution}, but
  # with useful aliases for other vocabularies.
  #
  class Distribution

    # Dialect used when the resource does not supply a CSV delimiter.
    DEFAULT_DELIMITER = ",".freeze

    # @!attribute format
    #   @return [DistributionFormat] the file format of the distribution.
    attr_accessor :format

    # @!attribute access_url
    #   @return [String] a URL to access the distribution.
    attr_accessor :access_url
    alias_method :uri, :access_url
    alias_method :download_url, :access_url

    # @!attribute path
    #   @return [String] the path of the distribution within the source, if appropriate
    attr_accessor :path

    # @!attribute title
    #   @return [String] a short title, unique within the dataset. Also available as +name+.
    attr_accessor :title
    alias_method :name, :title

    # @!attribute description
    #   @return [String] a textual description
    attr_accessor :description

    # @!attribute schema
    #   @return [Hash] a hash representing the schema of the data within the distribution. Will
    #     change to a more structured object later.
    attr_accessor :schema

    # Create a new Distribution from one of the supported resource descriptions.
    # The first of +:datapackage_resource+, +:dcat_resource+ and +:ckan_resource+
    # that is present in +options+ is used; the others are ignored.
    #
    # @param dataset [Dataset] the {Dataset} that this is a part of.
    # @param options [Hash] A set of options with which to initialise the distribution.
    # @option options [Hash] :datapackage_resource the +resource+ section of a Datapackage
    #   representation to load information from (string keys).
    # @option options [Hash] :dcat_resource a DCAT resource (symbol keys +:title+, +:description+, +:accessURL+).
    # @option options [Hash] :ckan_resource a CKAN resource (symbol keys +:title+, +:description+,
    #   +:accessURL+, +:format+).
    def initialize(dataset, options)
      @dataset = dataset
      if (resource = options[:datapackage_resource])
        load_datapackage_resource(resource)
      elsif (resource = options[:dcat_resource])
        load_dcat_resource(resource)
      elsif (resource = options[:ckan_resource])
        load_ckan_resource(resource)
      end
      # Set default CSV dialect
      @dialect ||= { "delimiter" => DEFAULT_DELIMITER }
    end

    # An array of column headers for the distribution. Loaded from the schema's +fields+ if
    # present, otherwise from the parsed data.
    #
    # @return [Array<String>, nil] an array of column headers, or +nil+ when neither a schema
    #   nor loadable data is available.
    def headers
      @headers ||= begin
        fields = @schema && @schema['fields']
        if fields
          fields.map { |field| field['id'] }
        else
          table = data
          table && table.headers
        end
      end
    end

    # Whether the file that the distribution represents actually exists
    #
    # @return [Boolean, nil] +false+ if the stored HTTP response code is 404, +true+ for any other
    #   code, and +nil+ when there is no access URL or no HTTP response was fetched.
    def exists?
      return nil unless @access_url && @response
      @response.response_code != 404
    end

    # A CSV object representing the loaded data. Only the +:csv+ format is parsed.
    #
    # Loading is best-effort: any error while fetching or parsing yields +nil+.
    #
    # @return [CSV::Table, nil] the result of +CSV.parse+ with headers enabled, or +nil+ when the
    #   data cannot be loaded or the format is not CSV.
    def data
      @data ||= begin
        datafile = load_datafile
        datafile ? parse_datafile(datafile) : nil
      rescue StandardError
        nil
      end
    end

    private

    # Populate the distribution from a Datapackage +resource+ hash.
    def load_datapackage_resource(resource)
      @description = resource['description']
      @dialect     = resource['dialect']
      @schema      = resource['schema']
      @path        = resource['path']
      @access_url  = resource['url']
      # Load HTTP Response for further use
      @response = Curl::Easy.http_head(@access_url) if @access_url
      # Work out format: explicit 'format' wins, otherwise use the path's extension
      extension = resource['format']
      extension = extension_from_path(@path) if extension.nil?
      @format = extension ? DistributionFormat.new(extension, @response) : nil
      @title = @path || @access_url
    end

    # Populate the distribution from a DCAT resource hash. No HTTP request is made.
    def load_dcat_resource(resource)
      @title       = resource[:title]
      @description = resource[:description] || resource[:title]
      @access_url  = resource[:accessURL]
    end

    # Populate the distribution from a CKAN resource hash.
    def load_ckan_resource(resource)
      @title       = resource[:title]
      @description = resource[:description] || resource[:title]
      @access_url  = resource[:accessURL]
      # Load HTTP Response for further use
      if @access_url
        @response = Curl::Easy.http_head(@access_url) do |c|
          c.follow_location = true
          c.useragent = "curb"
        end
      end
      @format = resource[:format] ? DistributionFormat.new(resource[:format], @response) : nil
    end

    # Upper-cased text after the last '.' in +path+, or nil if +path+ is not a usable String.
    def extension_from_path(path)
      return nil unless path.is_a?(String)
      last_part = path.split('.').last
      last_part && last_part.upcase
    end

    # Fetch the raw file: from the dataset when a path is known, otherwise over HTTP.
    def load_datafile
      if @path
        @dataset.send(:load_file, @path)
      elsif @access_url
        RestClient.get(@access_url)
      end
    end

    # Parse raw file contents according to the distribution's format.
    def parse_datafile(datafile)
      case format && format.extension
      when :csv
        CSV.parse(
          datafile,
          :headers => true,
          :col_sep => @dialect["delimiter"] || DEFAULT_DELIMITER
        )
      end
    end

  end

end
@@ -0,0 +1,73 @@
1
module DataKitten

  # A file format for a distribution
  #
  # For instance CSV, XML, etc.
  #
  class DistributionFormat

    # Known formats, keyed by lowercase extension symbol. +:structured+ marks
    # machine-readable formats, +:open+ marks open formats.
    FORMATS = {
      csv:    { structured: true,  open: true  },
      xls:    { structured: true,  open: false },
      rdf:    { structured: true,  open: true  },
      xml:    { structured: true,  open: true  },
      wms:    { structured: true,  open: true  },
      ods:    { structured: true,  open: true  },
      rdfa:   { structured: true,  open: true  },
      kml:    { structured: true,  open: true  },
      rss:    { structured: true,  open: true  },
      json:   { structured: true,  open: true  },
      ical:   { structured: true,  open: true  },
      sparql: { structured: true,  open: true  },
      georss: { structured: true,  open: true  },
      shp:    { structured: true,  open: true  },
      html:   { structured: false, open: true  },
      doc:    { structured: false, open: false },
      pdf:    { structured: false, open: true  },
    }.freeze

    # @!attribute extension
    #   @return [Symbol] a symbol for the file extension. For instance, :csv.
    attr_reader :extension

    # Create a new DistributionFormat object with the relevant extension
    #
    # @param extension [String] the file extension for the format; stored as a lowercase symbol
    # @param response [Object, nil] the HTTP response for the distribution; only its
    #   +content_type+ is read, by {#matches?}
    def initialize(extension, response)
      @extension = extension.to_s.downcase.to_sym
      @response = response
    end

    # Is this a structured format?
    #
    # @return [Boolean, nil] whether the format is machine-readable or not; +nil+ for an unknown extension.
    def structured?
      format_flag(:structured)
    end

    # Is this an open format?
    #
    # @return [Boolean, nil] whether the format is open or not; +nil+ for an unknown extension.
    def open?
      format_flag(:open)
    end

    # Whether the format of the file matches the extension given by the data
    #
    # @return [Boolean, nil] whether the content type of the HTTP response contains one of the MIME
    #   types registered for the extension. +false+ when the extension has no registered MIME type
    #   or the response has no content type; +nil+ when there is no response or the lookup fails.
    def matches?
      return nil if @response.nil?
      content_type = @response.content_type
      return false unless content_type
      # Plain substring comparison: MIME types contain regexp metacharacters
      # ('+', '.') so they must not be interpolated into a pattern.
      MIME::Types.type_for(@extension.to_s).any? do |mime|
        content_type.include?(mime.content_type)
      end
    rescue StandardError
      nil
    end

    private

    # Look up one flag for this extension in FORMATS; nil when the extension is unknown.
    def format_flag(flag)
      entry = FORMATS[@extension]
      entry && entry[flag]
    end

  end

end
@@ -0,0 +1,23 @@
1
+ require 'data_kitten/hosts/github'
2
+ require 'data_kitten/hosts/bitbucket'
3
+ require 'data_kitten/hosts/gist'
4
+
5
module DataKitten

  module Hosts

    private

    # Extends this object with every host module whose +supported?+ check
    # accepts +@access_url+. Candidates are tried in the order GitHub,
    # Bitbucket, Gist.
    #
    # @return [Array<Module>] the list of candidate host modules
    def detect_host
      known_hosts = [
        DataKitten::Hosts::Github,
        DataKitten::Hosts::Bitbucket,
        DataKitten::Hosts::Gist
      ]
      known_hosts.each do |candidate|
        next unless candidate.supported?(@access_url)
        extend candidate
      end
    end

  end

end
@@ -0,0 +1,54 @@
1
module DataKitten

  module Hosts

    # Bitbucket host module. Automatically mixed into {Dataset} for datasets that are loaded from Bitbucket.
    #
    # @see Dataset
    #
    module Bitbucket

      # Whether the given URI points at Bitbucket.
      #
      # @param uri [String] the access URL to check
      # @return [Integer, nil] a truthy match index when +uri+ is a git/http/https URL whose host
      #   ends in +bitbucket.org+, otherwise +nil+
      def self.supported?(uri)
        uri =~ %r{\A(git|https?)://[^/]*bitbucket\.org/}
      end

      # Where the dataset is hosted.
      # @return [Symbol] +:bitbucket+
      # @see Dataset#host
      def host
        :bitbucket
      end

      # Helper for generating Bitbucket URLs
      #
      # @param path [String] The path to append to the Bitbucket base URL.
      #
      # @return [String] The supplied path with the Bitbucket base URL prepended
      #
      # @example
      #   dataset = Dataset.new(access_url: 'https://bitbucket.org/floppy/hot-drinks.git')
      #   dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
      #   dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
      def bitbucket_path(path = '')
        "https://bitbucket.org/#{bitbucket_user_name}/#{bitbucket_repository_name}/#{path}"
      end

      private

      # Second-to-last '/'-separated segment of +uri+.
      def bitbucket_user_name
        @bitbucket_user_name ||= uri.split('/')[-2]
      end

      # Last '/'-separated segment of +uri+ with a trailing ".git" removed.
      # Only the suffix is stripped so repository names containing dots survive intact.
      def bitbucket_repository_name
        @bitbucket_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
      end

    end

  end

end
@@ -0,0 +1,50 @@
1
module DataKitten

  module Hosts

    # Gist host module. Automatically mixed into {Dataset} for datasets that are loaded from Gist.
    #
    # @see Dataset
    #
    module Gist

      # Whether the given URI points at Gist.
      #
      # @param uri [String] the access URL to check
      # @return [Integer, nil] a truthy match index when +uri+ is a git/http/https URL on
      #   +gist.github.com+, otherwise +nil+
      def self.supported?(uri)
        uri =~ %r{\A(git|https?)://gist\.github\.com/}
      end

      # Where the dataset is hosted.
      # @return [Symbol] +:gist+
      # @see Dataset#host
      def host
        :gist
      end

      # Helper for generating Gist URLs
      #
      # @param path [String] The path to append to the Gist base URL.
      #
      # @return [String] The supplied path with the Gist base URL prepended
      #
      # @example
      #   dataset = Dataset.new(access_url: 'git://gist.github.com/5633865.git')
      #   dataset.gist_path # => 'https://gist.github.com/5633865/'
      #   dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
      def gist_path(path = '')
        "https://gist.github.com/#{gist_repository_name}/#{path}"
      end

      private

      # Last '/'-separated segment of +uri+ with a trailing ".git" removed.
      def gist_repository_name
        @gist_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
      end

    end

  end

end
@@ -0,0 +1,54 @@
1
module DataKitten

  module Hosts

    # GitHub host module. Automatically mixed into {Dataset} for datasets that are loaded from GitHub.
    #
    # @see Dataset
    #
    module Github

      # Whether the given URI points at GitHub.
      #
      # @param uri [String] the access URL to check
      # @return [Integer, nil] a truthy match index when +uri+ is a git/http/https URL on
      #   +github.com+, otherwise +nil+
      def self.supported?(uri)
        uri =~ %r{\A(git|https?)://github\.com/}
      end

      # Where the dataset is hosted.
      # @return [Symbol] +:github+
      # @see Dataset#host
      def host
        :github
      end

      # Helper for generating GitHub URLs
      #
      # @param path [String] The path to append to the GitHub base URL.
      #
      # @return [String] The supplied path with the GitHub base URL prepended
      #
      # @example
      #   dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
      #   dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
      #   dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
      def github_path(path = '')
        "https://github.com/#{github_user_name}/#{github_repository_name}/#{path}"
      end

      private

      # Second-to-last '/'-separated segment of +uri+.
      def github_user_name
        @github_user_name ||= uri.split('/')[-2]
      end

      # Last '/'-separated segment of +uri+ with a trailing ".git" removed.
      # Only the suffix is stripped so names such as "theodi.github.io" survive intact.
      def github_repository_name
        @github_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
      end

    end

  end

end
@@ -0,0 +1,39 @@
1
module DataKitten

  # A license for a {Dataset} or {Distribution}
  #
  class License

    # @!attribute id
    #   @return [String] a short ID that identifies the license.
    attr_accessor :id

    # @!attribute name
    #   @return [String] the human name of the license.
    attr_accessor :name

    # @!attribute uri
    #   @return [String] the URI for the license text.
    attr_accessor :uri

    # @!attribute type
    #   @return [String] the type of information this license applies to. Could be +:data+ or +:content+.
    attr_accessor :type

    # Create a new License object. Any option that is not supplied leaves the
    # corresponding attribute +nil+; it can be assigned later through its accessor.
    #
    # @param options [Hash] A set of options with which to initialise the license.
    # @option options [String] :id the short ID for the license
    # @option options [String] :name the human name for the license
    # @option options [String] :uri the URI of the license text
    # @option options [String] :type the type of information covered by this license.
    def initialize(options = {})
      @id   = options[:id]
      @name = options[:name]
      @uri  = options[:uri]
      @type = options[:type]
    end

  end

end