data_kitten 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ module DataKitten
2
+
3
+ # A specific available form of a dataset, such as a CSV file, an API, or an RSS feed.
4
+ #
5
+ # Based on {http://www.w3.org/TR/vocab-dcat/#class-distribution dcat:Distribution}, but
6
+ # with useful aliases for other vocabularies.
7
+ #
8
class Distribution

  # @!attribute format
  #   @return [DistributionFormat, nil] the file format of the distribution, or
  #     +nil+ when no format could be determined.
  attr_accessor :format

  # @!attribute access_url
  #   @return [String] a URL to access the distribution.
  attr_accessor :access_url
  alias_method :uri, :access_url
  alias_method :download_url, :access_url

  # @!attribute path
  #   @return [String] the path of the distribution within the source, if appropriate
  attr_accessor :path

  # @!attribute title
  #   @return [String] a short title, unique within the dataset
  attr_accessor :title
  # A usable name for the distribution; simply another name for {#title}.
  alias_method :name, :title

  # @!attribute description
  #   @return [String] a textual description
  attr_accessor :description

  # @!attribute schema
  #   @return [Hash] a hash representing the schema of the data within the distribution. Will
  #     change to a more structured object later.
  attr_accessor :schema

  # Create a new Distribution from one of the supported resource descriptions.
  # The first of +:datapackage_resource+, +:dcat_resource+ and +:ckan_resource+
  # that is present in +options+ is used; the others are ignored.
  #
  # @param dataset [Dataset] the {Dataset} that this is a part of.
  # @param options [Hash] A set of options with which to initialise the distribution.
  # @option options [Hash] :datapackage_resource the +resource+ section of a Datapackage
  #   representation to load information from (string keys).
  # @option options [Hash] :dcat_resource a DCAT resource description (symbol keys
  #   +:title+, +:description+, +:accessURL+).
  # @option options [Hash] :ckan_resource a CKAN resource description (symbol keys
  #   +:title+, +:description+, +:accessURL+, +:format+).
  def initialize(dataset, options)
    @dataset = dataset
    if (resource = options[:datapackage_resource])
      load_datapackage_resource(resource)
    elsif (resource = options[:dcat_resource])
      load_dcat_resource(resource)
    elsif (resource = options[:ckan_resource])
      load_ckan_resource(resource)
    end
    # Fall back to a plain comma-separated dialect when none was supplied.
    @dialect ||= { "delimiter" => "," }
  end

  # An array of column headers for the distribution. Loaded from the schema, or from the
  # file directly if no schema is present.
  #
  # @return [Array<String>, nil] the column headers, or +nil+ when there is no
  #   schema and the data could not be loaded.
  def headers
    @headers ||= if @schema
      @schema['fields'].map { |field| field['id'] }
    else
      table = data
      table.headers if table
    end
  end

  # Whether the file that the distribution represents actually exists.
  #
  # @return [Boolean, nil] +false+ when the stored HTTP response has status 404,
  #   +true+ for any other status, and +nil+ when there is no access URL or no
  #   HTTP response was recorded for it (so nothing can be said either way).
  def exists?
    return nil unless @access_url && @response
    @response.response_code != 404
  end

  # The parsed contents of the distribution. Only CSV is currently parsed; any
  # other format, a missing file, or any error while loading/parsing yields +nil+.
  #
  # @return [CSV::Table, nil] the parsed CSV (with headers), or +nil+.
  def data
    @data ||= begin
      raw = fetch_datafile
      parse_datafile(raw) if raw
    rescue StandardError
      # Loading is best-effort: callers get nil rather than an exception.
      nil
    end
  end

  private

  # Populate the distribution from a Datapackage +resource+ hash.
  def load_datapackage_resource(resource)
    @description = resource['description']
    @path = resource['path']
    @access_url = resource['url']
    # Keep the HEAD response around so the format and #exists? can use it.
    @response = Curl::Easy.http_head(@access_url) if @access_url
    @format = datapackage_format(resource)
    @dialect = resource['dialect']
    @schema = resource['schema']
    # The path identifies the distribution when present, otherwise the URL does.
    @title = @path || @access_url
  end

  # Work out the format from the explicit +format+ entry, falling back to the
  # text after the last dot of the path.
  def datapackage_format(resource)
    extension = resource['format']
    if extension.nil? && @path.is_a?(String)
      suffix = @path.split('.').last
      extension = suffix.upcase if suffix
    end
    extension ? DistributionFormat.new(extension, @response) : nil
  end

  # Populate the distribution from a DCAT resource hash. No HTTP request is made.
  def load_dcat_resource(resource)
    @title = resource[:title]
    # Use the title as the description only when no description is supplied.
    @description = resource[:description] || resource[:title]
    @access_url = resource[:accessURL]
  end

  # Populate the distribution from a CKAN resource hash.
  def load_ckan_resource(resource)
    @title = resource[:title]
    # Use the title as the description only when no description is supplied.
    @description = resource[:description] || resource[:title]
    @access_url = resource[:accessURL]
    if @access_url
      @response = Curl::Easy.http_head(@access_url) do |c|
        c.follow_location = true
        c.useragent = "curb"
      end
    end
    @format = resource[:format] ? DistributionFormat.new(resource[:format], @response) : nil
  end

  # Fetch the raw file contents: from the dataset when a path is known,
  # otherwise over HTTP from the access URL.
  def fetch_datafile
    if @path
      @dataset.send(:load_file, @path)
    elsif @access_url
      RestClient.get(@access_url)
    end
  end

  # Parse raw contents according to the distribution's format (CSV only).
  def parse_datafile(raw)
    return nil unless format && format.extension == :csv
    CSV.parse(
      raw,
      :headers => true,
      # A dialect may omit the delimiter; CSV rejects a nil separator.
      :col_sep => @dialect["delimiter"] || ","
    )
  end

end
155
+
156
+ end
@@ -0,0 +1,73 @@
1
+ module DataKitten
2
+
3
+ # A file format for a distribution
4
+ #
5
+ # For instance CSV, XML, etc.
6
+ #
7
class DistributionFormat

  # Known formats, keyed by lowercase extension symbol. +:structured+ marks
  # machine-readable formats and +:open+ marks open (non-proprietary) ones.
  FORMATS = {
    csv:    { structured: true,  open: true  }.freeze,
    xls:    { structured: true,  open: false }.freeze,
    rdf:    { structured: true,  open: true  }.freeze,
    xml:    { structured: true,  open: true  }.freeze,
    wms:    { structured: true,  open: true  }.freeze,
    ods:    { structured: true,  open: true  }.freeze,
    rdfa:   { structured: true,  open: true  }.freeze,
    kml:    { structured: true,  open: true  }.freeze,
    rss:    { structured: true,  open: true  }.freeze,
    json:   { structured: true,  open: true  }.freeze,
    ical:   { structured: true,  open: true  }.freeze,
    sparql: { structured: true,  open: true  }.freeze,
    georss: { structured: true,  open: true  }.freeze,
    shp:    { structured: true,  open: true  }.freeze,
    html:   { structured: false, open: true  }.freeze,
    doc:    { structured: false, open: false }.freeze,
    pdf:    { structured: false, open: true  }.freeze,
  }.freeze

  # @!attribute extension
  #   @return [Symbol] a symbol for the file extension. For instance, :csv.
  attr_reader :extension

  # Create a new DistributionFormat object with the relevant extension
  #
  # @param extension [String, Symbol] the file extension for the format; stored
  #   as a lowercase symbol.
  # @param response [#content_type, nil] the HTTP response for the distribution,
  #   used by {#matches?}. May be omitted when no response is available.
  def initialize(extension, response = nil)
    @extension = extension.to_s.downcase.to_sym
    @response = response
  end

  # Is this a structured format?
  #
  # @return [Boolean, nil] whether the format is machine-readable or not;
  #   +nil+ when the extension is not a known format.
  def structured?
    format_property(:structured)
  end

  # Is this an open format?
  #
  # @return [Boolean, nil] whether the format is open or not; +nil+ when the
  #   extension is not a known format.
  def open?
    format_property(:open)
  end

  # Whether the format of the file matches the extension given by the data
  #
  # @return [Boolean, nil] +true+ when the Content-Type of the HTTP response
  #   contains one of the MIME types registered for the extension, +false+ when
  #   it does not (including when no MIME type is registered for the extension
  #   or the response carries no Content-Type), and +nil+ when there is no
  #   usable response or the MIME lookup fails.
  def matches?
    return nil unless @response.respond_to?(:content_type)
    expected = MIME::Types.type_for(@extension.to_s).map(&:content_type)
    actual = @response.content_type
    return false if actual.nil? || expected.empty?
    # Plain substring comparison: MIME types contain regex metacharacters
    # ("+", ".") and the header may carry parameters such as "; charset=utf-8".
    expected.any? { |mime| actual.include?(mime) }
  rescue StandardError
    nil
  end

  private

  # Look up one property of this format in FORMATS; nil for unknown formats.
  def format_property(key)
    entry = FORMATS[@extension]
    entry && entry[key]
  end

end
72
+
73
+ end
@@ -0,0 +1,23 @@
1
+ require 'data_kitten/hosts/github'
2
+ require 'data_kitten/hosts/bitbucket'
3
+ require 'data_kitten/hosts/gist'
4
+
5
+ module DataKitten
6
+
7
module Hosts

  private

  # Mix in the host-specific helper module for every known host whose
  # +supported?+ check accepts this object's +@access_url+. Objects whose URL
  # matches no known host are left untouched.
  #
  # @return [Array<Module>] the list of candidate host modules that was checked
  def detect_host
    candidates = [
      DataKitten::Hosts::Github,
      DataKitten::Hosts::Bitbucket,
      DataKitten::Hosts::Gist
    ]
    candidates.each do |candidate|
      next unless candidate.supported?(@access_url)
      extend candidate
    end
  end

end
22
+
23
+ end
@@ -0,0 +1,54 @@
1
+ module DataKitten
2
+
3
+ module Hosts
4
+
5
+ # Bitbucket host module. Automatically mixed into {Dataset} for datasets that are loaded from Bitbucket.
6
+ #
7
+ # @see Dataset
8
+ #
9
module Bitbucket

  # Whether the given URI points at a repository hosted on Bitbucket.
  # The host must be +bitbucket.org+ itself or a subdomain of it, optionally
  # preceded by user information (+user@+); look-alike hosts that merely end
  # in "bitbucket.org" are rejected.
  #
  # @param uri [String, nil] the access URL to test
  # @return [Boolean] +true+ for git/http/https Bitbucket URLs, +false+ otherwise
  def self.supported?(uri)
    !!(uri.to_s =~ /\A(git|https?):\/\/(?:[^\/@]*@)?(?:[^\/@]+\.)?bitbucket\.org\//)
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:bitbucket+
  # @see Dataset#host
  def host
    :bitbucket
  end

  # Helper for generating Bitbucket URLs
  #
  # @param path [String] The path to append to the Bitbucket base URL.
  #
  # @return [String] The supplied path with the Bitbucket base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'https://bitbucket.org/floppy/hot-drinks.git')
  #   dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
  #   dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
  def bitbucket_path(path = '')
    "https://bitbucket.org/#{bitbucket_user_name}/#{bitbucket_repository_name}/#{path}"
  end

  private

  # The second-to-last path segment of the URI (memoized).
  def bitbucket_user_name
    @bitbucket_user_name ||= uri.split('/')[-2]
  end

  # The last path segment of the URI with a trailing ".git" removed (memoized).
  # Only the ".git" suffix is stripped so that names containing dots survive.
  def bitbucket_repository_name
    @bitbucket_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
  end

end
51
+
52
+ end
53
+
54
+ end
@@ -0,0 +1,50 @@
1
+ module DataKitten
2
+
3
+ module Hosts
4
+
5
+ # Gist host module. Automatically mixed into {Dataset} for datasets that are loaded from Gist.
6
+ #
7
+ # @see Dataset
8
+ #
9
module Gist

  # Whether the given URI points at a gist hosted on gist.github.com.
  #
  # @param uri [String, nil] the access URL to test
  # @return [Boolean] +true+ for git/http/https Gist URLs, +false+ otherwise
  def self.supported?(uri)
    !!(uri.to_s =~ /\A(git|https?):\/\/gist\.github\.com\//)
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:gist+
  # @see Dataset#host
  def host
    :gist
  end

  # Helper for generating Gist URLs
  #
  # @param path [String] The path to append to the Gist base URL.
  #
  # @return [String] The supplied path with the Gist base URL (which ends in a
  #   slash) prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'git://gist.github.com/5633865.git')
  #   dataset.gist_path # => 'https://gist.github.com/5633865/'
  #   dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
  def gist_path(path = '')
    "https://gist.github.com/#{gist_repository_name}/#{path}"
  end

  private

  # The last path segment of the URI with a trailing ".git" removed (memoized).
  def gist_repository_name
    @gist_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
  end

end
47
+
48
+ end
49
+
50
+ end
@@ -0,0 +1,54 @@
1
+ module DataKitten
2
+
3
+ module Hosts
4
+
5
+ # GitHub host module. Automatically mixed into {Dataset} for datasets that are loaded from GitHub.
6
+ #
7
+ # @see Dataset
8
+ #
9
module Github

  # Whether the given URI points at a repository hosted on github.com.
  #
  # @param uri [String, nil] the access URL to test
  # @return [Boolean] +true+ for git/http/https GitHub URLs, +false+ otherwise
  def self.supported?(uri)
    !!(uri.to_s =~ /\A(git|https?):\/\/github\.com\//)
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:github+
  # @see Dataset#host
  def host
    :github
  end

  # Helper for generating GitHub URLs
  #
  # @param path [String] The path to append to the GitHub base URL.
  #
  # @return [String] The supplied path with the GitHub base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
  #   dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
  #   dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
  def github_path(path = '')
    "https://github.com/#{github_user_name}/#{github_repository_name}/#{path}"
  end

  private

  # The second-to-last path segment of the URI (memoized).
  def github_user_name
    @github_user_name ||= uri.split('/')[-2]
  end

  # The last path segment of the URI with a trailing ".git" removed (memoized).
  # Only the ".git" suffix is stripped so that names containing dots survive.
  def github_repository_name
    @github_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
  end

end
51
+
52
+ end
53
+
54
+ end
@@ -0,0 +1,39 @@
1
+ module DataKitten
2
+
3
+ # A license for a {Dataset} or {Distribution}
4
+ #
5
class License

  # @!attribute id
  #   @return [String] a short ID that identifies the license.
  # @!attribute name
  #   @return [String] the human name of the license.
  # @!attribute uri
  #   @return [String] the URI for the license text.
  # @!attribute type
  #   @return [String] the type of information this license applies to. Could be +:data+ or +:content+.
  attr_accessor :id, :name, :uri, :type

  # Build a License from an options hash. Each of the four attributes is read
  # from the key of the same name; keys that are absent leave the attribute +nil+.
  #
  # @param options [Hash] A set of options with which to initialise the license.
  # @option options [String] :id the short ID for the license
  # @option options [String] :name the human name for the license
  # @option options [String] :uri the URI of the license text
  # @option options [String] :type the type of information covered by this license.
  def initialize(options)
    [:id, :name, :uri, :type].each do |attribute|
      instance_variable_set("@#{attribute}", options[attribute])
    end
  end

end
38
+
39
+ end