datapackage 0.1.3 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,147 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Tabular Data Package",
4
- "description": "Tabular Data Package is a simple specification for data access and delivery of tabular data.",
5
- "type": "object",
6
- "required": [ "name", "resources" ],
7
- "properties": {
8
- "name": {
9
- "$ref": "definitions.json#/define/name",
10
- "propertyOrder": 10
11
- },
12
- "title": {
13
- "$ref": "definitions.json#/define/title",
14
- "propertyOrder": 20
15
- },
16
- "description": {
17
- "$ref": "definitions.json#/define/description",
18
- "format": "textarea",
19
- "propertyOrder": 30
20
- },
21
- "homepage": {
22
- "$ref": "definitions.json#/define/homepage",
23
- "propertyOrder": 40
24
- },
25
- "version": {
26
- "$ref": "definitions.json#/define/version",
27
- "propertyOrder": 50
28
- },
29
- "license": {
30
- "$ref": "definitions.json#/define/license",
31
- "propertyOrder": 60
32
- },
33
- "author": {
34
- "$ref": "definitions.json#/define/author",
35
- "propertyOrder": 70
36
- },
37
- "contributors": {
38
- "$ref": "definitions.json#/define/contributors",
39
- "propertyOrder": 80,
40
- "options": { "hidden": true }
41
- },
42
- "resources": {
43
- "title": "Resources",
44
- "description": "The data resources that this package describes.",
45
- "type": "array",
46
- "propertyOrder": 90,
47
- "minItems": 0,
48
- "items": {
49
- "type": "object",
50
- "properties": {
51
- "name": {
52
- "$ref": "definitions.json#/define/name",
53
- "propertyOrder": 10
54
- },
55
- "title": {
56
- "$ref": "definitions.json#/define/title",
57
- "propertyOrder": 20
58
- },
59
- "description": {
60
- "$ref": "definitions.json#/define/description",
61
- "propertyOrder": 30,
62
- "format": "textarea"
63
- },
64
- "schema": {
65
- "$ref": "definitions.json#/define/schema",
66
- "propertyOrder": 40
67
- },
68
- "url": {
69
- "$ref": "definitions.json#/define/url",
70
- "propertyOrder": 50
71
- },
72
- "path": {
73
- "$ref": "definitions.json#/define/path",
74
- "propertyOrder": 60
75
- },
76
- "data": {
77
- "$ref": "definitions.json#/define/data",
78
- "propertyOrder": 70
79
- },
80
- "format": {
81
- "$ref": "definitions.json#/define/format",
82
- "propertyOrder": 80
83
- },
84
- "mediatype": {
85
- "$ref": "definitions.json#/define/mediatype",
86
- "propertyOrder": 90
87
- },
88
- "encoding": {
89
- "$ref": "definitions.json#/define/encoding",
90
- "propertyOrder": 100
91
- },
92
- "bytes": {
93
- "$ref": "definitions.json#/define/bytes",
94
- "propertyOrder": 110,
95
- "options": { "hidden": true }
96
- },
97
- "hash": {
98
- "$ref": "definitions.json#/define/hash",
99
- "propertyOrder": 120,
100
- "options": { "hidden": true }
101
- },
102
- "dialect": {
103
- "$ref": "definitions.json#/define/dialect",
104
- "propertyOrder": 130,
105
- "options": { "hidden": true }
106
- },
107
- "sources": {
108
- "$ref": "definitions.json#/define/sources",
109
- "propertyOrder": 140,
110
- "options": { "hidden": true }
111
- },
112
- "license": {
113
- "$ref": "definitions.json#/define/license",
114
- "description": "The license under which the resource is published.",
115
- "propertyOrder": 150,
116
- "options": { "hidden": true }
117
- }
118
- },
119
- "anyOf": [
120
- { "title": "url required", "required": ["url"] },
121
- { "title": "path required", "required": ["path"] },
122
- { "title": "data required", "required": ["data"] }
123
- ],
124
- "required": [ "schema" ]
125
- }
126
- },
127
- "keywords": {
128
- "$ref": "definitions.json#/define/keywords",
129
- "propertyOrder": 100
130
- },
131
- "sources": {
132
- "$ref": "definitions.json#/define/sources",
133
- "propertyOrder": 110,
134
- "options": { "hidden": true }
135
- },
136
- "image": {
137
- "$ref": "definitions.json#/define/image",
138
- "propertyOrder": 120,
139
- "options": { "hidden": true }
140
- },
141
- "dataDependencies": {
142
- "$ref": "definitions.json#/define/dataDependencies",
143
- "propertyOrder": 140,
144
- "options": { "hidden": true }
145
- }
146
- }
147
- }
File without changes
@@ -1,102 +0,0 @@
1
import csv
import os
import unittest
import urllib
import urllib.error
import urllib.request
5
-
6
# Absolute path of the directory one level above the one containing this file.
BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

# Location of the registry CSV, directly inside BASE_PATH.
REGISTRY_PATH = os.path.join(BASE_PATH, 'registry.csv')
13
-
14
-
15
class TestRegistry(unittest.TestCase):
    """Consistency checks for the registry CSV and the resources it lists.

    The registry is read from the module-level ``REGISTRY_PATH``; schema
    paths are resolved relative to the module-level ``BASE_PATH``.
    """

    # Columns that must be present in the header and non-empty in every row.
    REQUIRED_COLUMNS = (
        'id',
        'title',
        'schema',
        'schema_path',
        'specification',
    )

    def _read_rows(self):
        """Return every registry row as a list of dicts keyed by header."""
        with open(REGISTRY_PATH, 'r', newline='') as f:
            return list(csv.DictReader(f))

    def _assert_urls_reachable(self, column, msg_template):
        """HEAD-request the URL stored in ``column`` of every registry row.

        ``msg_template`` is formatted with the URL and the row id. The
        message is built *before* the request so that it is always defined
        (and refers to the current row) when the request itself fails.
        """
        for entry in self._read_rows():
            url = entry[column]
            msg = msg_template.format(url, entry['id'])
            try:
                res = self._make_head_request(url)
            except urllib.error.URLError as e:
                raise Exception(msg) from e
            try:
                # 2xx and 3xx both count as "exists".
                self.assertTrue(200 <= res.status < 400, msg)
            finally:
                # Do not leak the connection opened by urlopen().
                res.close()

    def test_registry_has_the_expected_headers(self):
        with open(REGISTRY_PATH, 'r', newline='') as f:
            headers = next(csv.reader(f))

        self.assertEqual(sorted(headers), sorted(self.REQUIRED_COLUMNS))

    def test_registry_schemas_have_the_required_attributes(self):
        msg = "Schema '{0}' doesn't define required attribute '{1}'"

        for schema in self._read_rows():
            for key in self.REQUIRED_COLUMNS:
                # DictReader yields None for cells missing from a short row
                # and no key at all for a missing column; both must fail,
                # not only the empty string.
                self.assertTrue(
                    schema.get(key),
                    msg.format(schema.get('id'), key),
                )

    def test_registry_schemas_have_unique_ids(self):
        ids = [schema['id'] for schema in self._read_rows()]

        self.assertEqual(
            len(ids), len(set(ids)), "The schemas IDs aren't unique"
        )

    def test_schema_paths_exist_and_are_files(self):
        for entry in self._read_rows():
            schema_path = entry['schema_path']
            msg = "schema_path '{0}' of schema '{1}' isn't a file"
            msg = msg.format(schema_path, entry['id'])
            path = os.path.join(BASE_PATH, schema_path)
            self.assertTrue(os.path.isfile(path), msg)

    def test_schema_urls_exist(self):
        self._assert_urls_reachable(
            'schema',
            "Error fetching schema_url '{0}' of schema '{1}'",
        )

    def test_specification_urls_exist(self):
        # Checks the 'specification' column; probing 'schema' here would
        # merely repeat test_schema_urls_exist.
        self._assert_urls_reachable(
            'specification',
            "Error fetching specification '{0}' of schema '{1}'",
        )

    def _make_head_request(self, url):
        """Issue a HEAD request for ``url`` and return the open response."""
        req = urllib.request.Request(url, method='HEAD')
        return urllib.request.urlopen(req)
@@ -1,41 +0,0 @@
1
- import os
2
- import glob
3
- import json
4
- import unittest
5
- import jsonschema
6
-
7
# Absolute path of the directory one level above the one containing this file.
BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
13
-
14
-
15
class TestSchemas(unittest.TestCase):
    """Checks applied to every ``*.json`` file directly inside ``BASE_PATH``."""

    def test_json_files_must_be_valid(self):
        # Each file has to parse as JSON; a parse failure is re-raised with
        # the offending file name and the original error chained.
        pattern = os.path.join(BASE_PATH, '*.json')

        for candidate in glob.glob(pattern):
            try:
                with open(candidate, 'r') as handle:
                    json.load(handle)
            except ValueError as err:
                template = "File '{0}' isn't a valid JSON."
                raise ValueError(template.format(candidate)) from err

    def test_json_files_must_be_valid_json_schemas(self):
        # Each file has to be a well-formed JSON Schema according to the
        # validator class jsonschema selects for it.
        pattern = os.path.join(BASE_PATH, '*.json')

        for candidate in glob.glob(pattern):
            with open(candidate, 'r') as handle:
                document = json.load(handle)
                try:
                    checker_cls = jsonschema.validators.validator_for(document)
                    checker = checker_cls(document)
                    checker.check_schema(document)
                except jsonschema.exceptions.SchemaError as err:
                    template = "File '{0}' isn't a valid JSON Schema."
                    raise ValueError(template.format(candidate)) from err
@@ -1,12 +0,0 @@
1
module DataPackage
  # Error raised when the profile registry cannot be loaded or parsed.
  class RegistryError < StandardError
  end

  # Schema-related failure carrying a single value that is exposed both as
  # +status+ and as +message+.
  #
  # NOTE(review): this derives from Exception rather than StandardError, so a
  # bare `rescue` does not catch it - confirm that is intended.
  class SchemaException < Exception
    attr_reader :status, :message

    def initialize(status)
      @status = @message = status
    end
  end
end
@@ -1,181 +0,0 @@
1
- require 'open-uri'
2
-
3
module DataPackage
  # A data package descriptor. The descriptor itself is stored in the Hash
  # this class derives from; the resources it lists are wrapped in Resource
  # objects on load.
  class Package < Hash
    attr_reader :opts, :errors
    attr_writer :resources

    # Parse or create a data package
    #
    # Supports reading data from JSON file, directory, and a URL
    #
    # package:: Hash or a String
    # schema:: Hash, Symbol or String
    # opts:: Options used to customize reading and parsing
    def initialize(package = nil, schema = :base, opts = {})
      @opts = opts
      @schema = DataPackage::Schema.new(schema || :base)
      @dead_resources = []

      merge! parse_package(package)
      define_properties!
      load_resources!
    end

    # Turns the constructor argument into a descriptor Hash: nil becomes an
    # empty descriptor, a Hash (or any Hash subclass) is used as is, and
    # anything else is treated as a location and read from disk or the network.
    def parse_package(package)
      # TODO: base directory/url
      if package.nil?
        {}
      elsif package.is_a?(Hash)
        package
      else
        read_package(package)
      end
    end

    # Returns the directory for a local file package or base url for a remote
    # Returns an empty string for an in-memory object (because it has no base as yet)
    def base
      # user can override base
      return @opts[:base] if @opts[:base]
      return '' unless @location

      # work out base directory or uri
      if local?
        File.dirname(@location)
      else
        @location.split('/')[0..-2].join('/')
      end
    end

    # Is this a local package? Returns true if created from an in-memory object or a file/directory reference
    def local?
      return @local if @local
      return !@location.start_with?('http') if @location

      true
    end

    # The package's resources, re-wrapped on every access so that entries
    # assigned through +resources=+ are loaded as Resource objects too.
    def resources
      update_resources!
      @resources
    end

    # Value stored under +property+, or +default+ when it is absent or falsy.
    def property(property, default = nil)
      self[property] || default
    end

    # Runs validation and reports whether the descriptor satisfies the schema.
    def valid?
      validate
      @valid
    end

    # Validates against the schema, storing the error list in +errors+.
    def validate
      @errors = @schema.validation_errors(self)
      @valid = @schema.valid?(self)
    end

    # True unless +location+ was recorded as the path of a resource that
    # failed to load. (The check used to be inverted and answered true for
    # exactly the resources that were known to be dead.)
    def resource_exists?(location)
      !@dead_resources.include?(location)
    end

    # Serialises the descriptor through Hash's own JSON generator. Delegating
    # to +super+ avoids the endless self-recursion a plain `self.to_json`
    # causes, and accepting arguments lets JSON.generate pass its state.
    def to_json(*args)
      super
    end

    private

    # Defines a reader and a writer on this instance for every top-level
    # schema property except 'resources', which has dedicated accessors.
    def define_properties!
      (@schema['properties'] || {}).each do |key, definition|
        next if key == 'resources'

        define_singleton_method("#{key}=") { |value| set_property(key, value) }
        define_singleton_method(key.to_s) { property key, default_value(definition) }
      end
    end

    def load_resources!
      @resources = (self['resources'] || [])
      update_resources!
    end

    # Replaces each entry of @resources, in place, with its loaded Resource.
    # Loading is best effort: an entry that raises becomes nil and its path is
    # remembered in @dead_resources.
    def update_resources!
      @resources.map! do |resource|
        begin
          load_resource(resource)
        rescue StandardError
          @dead_resources << resource['path']
          nil
        end
      end
    end

    def load_resource(resource)
      if resource.is_a?(Resource)
        resource
      else
        Resource.load(resource, base)
      end
    end

    # Default returned by a generated property reader, chosen by the declared
    # schema type. Returns nil for 'string' and for any unlisted type.
    def default_value(schema_data)
      case schema_data['type']
      when 'string'
        nil
      when 'array'
        []
      when 'object'
        {}
      end
    end

    def set_property(key, value)
      self[key] = value
    end

    # Reads a descriptor from a directory, a URL, a JSON file or a zip
    # archive, and remembers where it came from in @location.
    def read_package(package)
      if is_directory?(package)
        package = File.join(package, opts[:default_filename] || 'datapackage.json')
      elsif is_containing_url?(package)
        package = URI.join(package, 'datapackage.json')
      end

      @location = package.to_s

      if File.extname(package.to_s) == '.zip'
        unzip_package(package)
      else
        # NOTE(review): Kernel#open on a caller-supplied string also accepts
        # "|command" and relies on open-uri patching Kernel#open for URLs;
        # consider File.read / URI.open once the minimum Ruby version allows.
        JSON.parse open(package).read
      end
    end

    def is_directory?(package)
      !package.start_with?('http') && File.directory?(package)
    end

    def is_containing_url?(package)
      package.start_with?('http') && !package.end_with?('datapackage.json', 'datapackage.zip')
    end

    # Downloads +url+ into a Tempfile, rewound and ready for reading.
    def write_to_tempfile(url)
      tempfile = Tempfile.new('datapackage')
      tempfile.write(open(url).read)
      tempfile.rewind
      tempfile
    end

    # Extracts a zipped package into a fresh temporary directory and returns
    # the parsed descriptor found inside the archive.
    def unzip_package(package)
      package = write_to_tempfile(package) if package.start_with?('http')
      dir = Dir.mktmpdir
      Zip::File.open(package) do |zip_file|
        # Extract all the files (flattened to their base names)
        zip_file.each { |entry| entry.extract("#{dir}/#{File.basename entry.name}") }
        # Get and parse the datapackage metadata
        entry = zip_file.glob("*/#{opts[:default_filename] || 'datapackage.json'}").first
        package = JSON.parse(entry.get_input_stream.read)
      end
      # Set the base dir to the directory we unzipped to
      @opts[:base] = dir
      # This is now a local file, not a URL
      @local = true
      package
    end
  end
end
@@ -1,81 +0,0 @@
1
module DataPackage
  ##
  # Looks up Data Package profiles through a registry CSV, which may be a
  # local file or a URL.

  class Registry

    DEFAULT_REGISTRY_URL = 'http://schemas.datapackages.org/registry.csv'
    DEFAULT_REGISTRY_PATH = File.join(File.expand_path(File.dirname(__FILE__)), '..', '..', 'datapackage', 'schemas', 'registry.csv')

    attr_reader :base_path

    # Loads the registry from +registry_path_or_url+; nil selects the default
    # path. +base_path+ is only set when the registry is a local file.
    def initialize(registry_path_or_url = DEFAULT_REGISTRY_PATH)
      source = registry_path_or_url || DEFAULT_REGISTRY_PATH
      @base_path = File.dirname(File.absolute_path(source)) if File.file?(source)
      @profiles = {}
      @registry = get_registry(source)
    end

    # Parsed profile for +profile_id+, cached once a truthy value was loaded.
    def get(profile_id)
      cached = @profiles[profile_id]
      return cached if cached

      @profiles[profile_id] = get_profile(profile_id)
    end

    # Registry rows, keyed by profile id.
    def available_profiles
      @registry
    end

    private

    # Loads a profile from its local schema_path when that file exists and
    # from its schema URL otherwise. Unknown ids give nil.
    def get_profile(profile_id)
      metadata = @registry[profile_id]
      return if metadata.nil?

      local_copy = get_absolute_path(metadata[:schema_path])
      return load_json(local_copy) if local_copy && File.file?(local_copy)

      load_json(metadata[:schema])
    end

    # Builds the id => row Hash; the listed failures become RegistryError.
    def get_registry(registry_path_or_url)
      parse_csv(registry_path_or_url).each_with_object({}) do |row, table|
        table[row.fetch(:id)] = row.headers.zip(row.fields).to_h
      end
    rescue KeyError, OpenURI::HTTPError, Errno::ENOENT
      raise(RegistryError)
    end

    # Reads the source and wraps it in a CSV reader with symbolised headers.
    # Content without a single comma is rejected as not being a registry.
    def parse_csv(path_or_url)
      content = open(path_or_url).read
      raise RegistryError unless content.match(/,/)

      CSV.new(content, headers: :first_row, header_converters: :symbol)
    end

    # Joins +relative_path+ onto the registry directory; nil when either part
    # is not a string (for instance when no base path is known).
    def get_absolute_path(relative_path)
      File.join(@base_path, relative_path)
    rescue TypeError
      nil
    end

    def load_json(path)
      JSON.parse(open(path).read)
    rescue JSON::ParserError, OpenURI::HTTPError
      raise RegistryError
    end

  end
end
@@ -1,83 +0,0 @@
1
module DataPackage
  # A single resource of a data package, stored as a Hash of its descriptor.
  # Subclasses differ only in how +data+ obtains the raw content.
  class Resource < Hash

    def initialize(resource, base_path = '')
      merge! resource
    end

    # Picks the Resource subclass matching how the data can be reached.
    #
    # resource::  descriptor Hash ('url', 'path' and 'data' are inspected)
    # base_path:: directory or base URL that relative paths resolve against
    #
    # Returns nil when the descriptor offers no usable way to reach the data.
    def self.load(resource, base_path = '')
      if is_url?(resource, base_path)
        # Only a URL is given, or the whole package lives at a URL
        RemoteResource.new(resource, base_path)
      elsif resource['data']
        # If there's a data attribute, we definitely want an inline resource
        InlineResource.new(resource)
      elsif file_exists?(resource, base_path)
        # If the file exists - we want a local resource
        LocalResource.new(resource, base_path)
      elsif resource['url']
        # The file is missing but there's a URL to grab the data from
        RemoteResource.new(resource, base_path)
      end
    end

    # True when the resource's path, resolved against +base_path+, exists.
    # A resource without a path has no file, so the answer is false rather
    # than a TypeError.
    def self.file_exists?(resource, base_path)
      path = resource['path']
      return false if path.nil?

      path = File.join(base_path, path) if base_path != ''
      # File.exists? is deprecated and no longer available on Ruby 3.2+.
      File.exist?(path)
    end

    # True when the data must be fetched remotely: either 'url' is the only
    # access method given, or +base_path+ itself is an http(s) location.
    def self.is_url?(resource, base_path)
      url_only = !resource['url'].nil? && resource['path'].nil? && resource['data'].nil?
      url_only || base_path.start_with?('http')
    end

    # The data parsed as CSV and wrapped in a JsonTableSchema::Table. Built
    # once and memoised; nil when the resource declares no schema.
    def table
      @table ||= JsonTableSchema::Table.new(CSV.parse(data), self['schema']) if self['schema']
    end

  end

  # Resource backed by a file on the local filesystem.
  class LocalResource < Resource

    def initialize(resource, base_path = '')
      @base_path = base_path
      @path = resource['path']
      super
    end

    # Content of the file. The full path is computed into a local on each
    # call instead of being written back to @path, so repeated calls keep
    # reading the same file rather than prefixing the base path again.
    def data
      full_path = @base_path != '' ? File.join(@base_path, @path) : @path
      # File.read closes the handle; it also never treats the path as a command.
      File.read(full_path)
    end

  end

  # Resource whose content is embedded in the descriptor itself.
  class InlineResource < Resource
    def data
      self['data']
    end
  end

  # Resource fetched from its own 'url', or from its path relative to a base URL.
  class RemoteResource < Resource

    def initialize(resource, base_url = '')
      @base_url = base_url
      @url = resource['url']
      @path = resource['path']
      super
    end

    def data
      url = @url || URI.join(@base_url, @path)
      # NOTE(review): relies on open-uri patching Kernel#open, which Ruby 3.0
      # removed; switch to URI.open once the minimum Ruby version allows.
      open(url).read
    end

  end
end