datapackage 0.1.3 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,147 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Tabular Data Package",
4
- "description": "Tabular Data Package is a simple specification for data access and delivery of tabular data.",
5
- "type": "object",
6
- "required": [ "name", "resources" ],
7
- "properties": {
8
- "name": {
9
- "$ref": "definitions.json#/define/name",
10
- "propertyOrder": 10
11
- },
12
- "title": {
13
- "$ref": "definitions.json#/define/title",
14
- "propertyOrder": 20
15
- },
16
- "description": {
17
- "$ref": "definitions.json#/define/description",
18
- "format": "textarea",
19
- "propertyOrder": 30
20
- },
21
- "homepage": {
22
- "$ref": "definitions.json#/define/homepage",
23
- "propertyOrder": 40
24
- },
25
- "version": {
26
- "$ref": "definitions.json#/define/version",
27
- "propertyOrder": 50
28
- },
29
- "license": {
30
- "$ref": "definitions.json#/define/license",
31
- "propertyOrder": 60
32
- },
33
- "author": {
34
- "$ref": "definitions.json#/define/author",
35
- "propertyOrder": 70
36
- },
37
- "contributors": {
38
- "$ref": "definitions.json#/define/contributors",
39
- "propertyOrder": 80,
40
- "options": { "hidden": true }
41
- },
42
- "resources": {
43
- "title": "Resources",
44
- "description": "The data resources that this package describes.",
45
- "type": "array",
46
- "propertyOrder": 90,
47
- "minItems": 0,
48
- "items": {
49
- "type": "object",
50
- "properties": {
51
- "name": {
52
- "$ref": "definitions.json#/define/name",
53
- "propertyOrder": 10
54
- },
55
- "title": {
56
- "$ref": "definitions.json#/define/title",
57
- "propertyOrder": 20
58
- },
59
- "description": {
60
- "$ref": "definitions.json#/define/description",
61
- "propertyOrder": 30,
62
- "format": "textarea"
63
- },
64
- "schema": {
65
- "$ref": "definitions.json#/define/schema",
66
- "propertyOrder": 40
67
- },
68
- "url": {
69
- "$ref": "definitions.json#/define/url",
70
- "propertyOrder": 50
71
- },
72
- "path": {
73
- "$ref": "definitions.json#/define/path",
74
- "propertyOrder": 60
75
- },
76
- "data": {
77
- "$ref": "definitions.json#/define/data",
78
- "propertyOrder": 70
79
- },
80
- "format": {
81
- "$ref": "definitions.json#/define/format",
82
- "propertyOrder": 80
83
- },
84
- "mediatype": {
85
- "$ref": "definitions.json#/define/mediatype",
86
- "propertyOrder": 90
87
- },
88
- "encoding": {
89
- "$ref": "definitions.json#/define/encoding",
90
- "propertyOrder": 100
91
- },
92
- "bytes": {
93
- "$ref": "definitions.json#/define/bytes",
94
- "propertyOrder": 110,
95
- "options": { "hidden": true }
96
- },
97
- "hash": {
98
- "$ref": "definitions.json#/define/hash",
99
- "propertyOrder": 120,
100
- "options": { "hidden": true }
101
- },
102
- "dialect": {
103
- "$ref": "definitions.json#/define/dialect",
104
- "propertyOrder": 130,
105
- "options": { "hidden": true }
106
- },
107
- "sources": {
108
- "$ref": "definitions.json#/define/sources",
109
- "propertyOrder": 140,
110
- "options": { "hidden": true }
111
- },
112
- "license": {
113
- "$ref": "definitions.json#/define/license",
114
- "description": "The license under which the resource is published.",
115
- "propertyOrder": 150,
116
- "options": { "hidden": true }
117
- }
118
- },
119
- "anyOf": [
120
- { "title": "url required", "required": ["url"] },
121
- { "title": "path required", "required": ["path"] },
122
- { "title": "data required", "required": ["data"] }
123
- ],
124
- "required": [ "schema" ]
125
- }
126
- },
127
- "keywords": {
128
- "$ref": "definitions.json#/define/keywords",
129
- "propertyOrder": 100
130
- },
131
- "sources": {
132
- "$ref": "definitions.json#/define/sources",
133
- "propertyOrder": 110,
134
- "options": { "hidden": true }
135
- },
136
- "image": {
137
- "$ref": "definitions.json#/define/image",
138
- "propertyOrder": 120,
139
- "options": { "hidden": true }
140
- },
141
- "dataDependencies": {
142
- "$ref": "definitions.json#/define/dataDependencies",
143
- "propertyOrder": 140,
144
- "options": { "hidden": true }
145
- }
146
- }
147
- }
File without changes
@@ -1,102 +0,0 @@
1
- import os
2
- import csv
3
- import urllib
4
- import unittest
5
-
6
# Directory one level above the folder that contains this test module.
BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

# Location of the registry CSV inside BASE_PATH.
REGISTRY_PATH = os.path.join(BASE_PATH, 'registry.csv')
13
-
14
-
15
class TestRegistry(unittest.TestCase):
    """Consistency checks for the schema registry stored at REGISTRY_PATH.

    The URL tests issue real HTTP ``HEAD`` requests, so they need network
    access to pass.
    """

    # Columns that make up the registry header and that every row must fill.
    REQUIRED_COLUMNS = (
        'id',
        'title',
        'schema',
        'schema_path',
        'specification',
    )

    def test_registry_has_the_expected_headers(self):
        """The CSV header consists of exactly the required columns."""
        with open(REGISTRY_PATH, 'r', newline='') as f:
            headers = next(csv.reader(f))

        self.assertEqual(sorted(headers), sorted(self.REQUIRED_COLUMNS))

    def test_registry_schemas_have_the_required_attributes(self):
        """Every row provides a non-empty value for each required column."""
        msg = "Schema '{0}' doesn't define required attribute '{1}'"

        for schema in self._read_registry():
            for key in self.REQUIRED_COLUMNS:
                # DictReader yields None for columns missing from a short
                # row, so check truthiness instead of comparing against ''.
                self.assertTrue(
                    schema.get(key),
                    msg.format(schema.get('id'), key),
                )

    def test_registry_schemas_have_unique_ids(self):
        """No two rows share the same ``id``."""
        ids = [schema['id'] for schema in self._read_registry()]

        self.assertEqual(
            len(ids), len(set(ids)), "The schemas IDs aren't unique"
        )

    def test_schema_paths_exist_and_are_files(self):
        """Each ``schema_path`` points at a file below BASE_PATH."""
        for entry in self._read_registry():
            schema_path = entry['schema_path']
            msg = "schema_path '{0}' of schema '{1}' isn't a file"
            msg = msg.format(schema_path, entry['id'])
            path = os.path.join(BASE_PATH, schema_path)
            self.assertTrue(os.path.isfile(path), msg)

    def test_schema_urls_exist(self):
        """Each ``schema`` URL answers a HEAD request with a 2xx/3xx status."""
        for entry in self._read_registry():
            url = entry['schema']
            msg = "Error fetching schema_url '{0}' of schema '{1}'"
            self._assert_url_is_reachable(url, msg.format(url, entry['id']))

    def test_specification_urls_exist(self):
        """Each ``specification`` URL answers a HEAD request with 2xx/3xx."""
        for entry in self._read_registry():
            # The specification column is what this test is about; the
            # schema column is covered by test_schema_urls_exist.
            url = entry['specification']
            msg = "Error fetching specification '{0}' of schema '{1}'"
            self._assert_url_is_reachable(url, msg.format(url, entry['id']))

    def _read_registry(self):
        """Return all registry rows as a list of dicts keyed by column name."""
        with open(REGISTRY_PATH, 'r', newline='') as f:
            return list(csv.DictReader(f))

    def _assert_url_is_reachable(self, url, msg):
        """Fail with ``msg`` unless a HEAD request to ``url`` succeeds.

        ``msg`` is built by the caller before the request is made, so it is
        always available when the request itself raises.

        Raises:
            Exception: wrapping the ``URLError`` when the request fails.
        """
        # Imported here because a bare ``import urllib`` does not guarantee
        # that the ``error`` submodule is loaded.
        import urllib.error

        try:
            res = self._make_head_request(url)
        except urllib.error.URLError as e:
            raise Exception(msg) from e

        self.assertTrue(200 <= res.status < 400, msg)

    def _make_head_request(self, url):
        """Send a HEAD request to ``url`` and return the response object."""
        # See _assert_url_is_reachable: make sure the submodule is loaded.
        import urllib.request

        req = urllib.request.Request(url, method='HEAD')
        return urllib.request.urlopen(req)
@@ -1,41 +0,0 @@
1
- import os
2
- import glob
3
- import json
4
- import unittest
5
- import jsonschema
6
-
7
# Directory one level above the folder that contains this test module.
BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
13
-
14
-
15
class TestSchemas(unittest.TestCase):
    """Checks that every ``*.json`` file directly inside BASE_PATH is usable."""

    def test_json_files_must_be_valid(self):
        """Every JSON file parses without error."""
        for json_path in self._json_paths():
            try:
                # JSON text is read as UTF-8 so the result does not depend
                # on the locale of the machine running the tests.
                with open(json_path, 'r', encoding='utf-8') as f:
                    json.load(f)
            except ValueError as e:
                msg = "File '{0}' isn't a valid JSON."
                raise ValueError(msg.format(json_path)) from e

    def test_json_files_must_be_valid_json_schemas(self):
        """Every JSON file is itself a well-formed JSON Schema."""
        for json_path in self._json_paths():
            with open(json_path, 'r', encoding='utf-8') as f:
                schema = json.load(f)
            try:
                # validator_for picks the validator class for the schema;
                # check_schema is a classmethod, so no instance is needed.
                validator_class = jsonschema.validators.validator_for(schema)
                validator_class.check_schema(schema)
            except jsonschema.exceptions.SchemaError as e:
                msg = "File '{0}' isn't a valid JSON Schema."
                raise ValueError(msg.format(json_path)) from e

    def _json_paths(self):
        """Return the paths of the ``*.json`` files found in BASE_PATH."""
        return sorted(glob.glob(os.path.join(BASE_PATH, '*.json')))
@@ -1,12 +0,0 @@
1
module DataPackage
  # Error type for registry failures; it carries no extra state.
  RegistryError = Class.new(StandardError)

  # Exception that exposes the value it was built with as both +status+
  # and +message+. It derives from Exception directly, so a bare +rescue+
  # does not catch it.
  class SchemaException < Exception
    attr_reader :status, :message

    def initialize(status)
      @message = @status = status
    end
  end
end
@@ -1,181 +0,0 @@
1
- require 'open-uri'
2
-
3
module DataPackage
  # A data package descriptor held as a Hash of its metadata, together with
  # the resources it lists and the schema it is validated against.
  class Package < Hash
    attr_reader :opts, :errors
    attr_writer :resources

    # Matches the locations that are fetched remotely instead of being read
    # from the local filesystem.
    REMOTE_LOCATION = %r{\A(?:https?|ftp)://}i

    # Parse or create a data package
    #
    # Supports reading data from JSON file, directory, and a URL
    #
    # package:: Hash or a String
    # schema:: Hash, Symbol or String
    # opts:: Options used to customize reading and parsing
    def initialize(package = nil, schema = :base, opts = {})
      @opts = opts
      @schema = DataPackage::Schema.new(schema || :base)
      @dead_resources = []

      merge! parse_package(package)
      define_properties!
      load_resources!
    end

    # Turns the constructor argument into descriptor data: nil becomes an
    # empty Hash, a Hash (or Hash subclass) is used as given, and anything
    # else is treated as a location to read from.
    def parse_package(package)
      # TODO: base directory/url
      if package.nil?
        {}
      elsif package.is_a?(Hash)
        package
      else
        read_package(package)
      end
    end

    # Returns the directory for a local file package or base url for a remote
    # Returns an empty string for an in-memory object (it has no base as yet)
    def base
      # user can override base
      return @opts[:base] if @opts[:base]
      return '' unless @location

      # work out base directory or uri
      if local?
        File.dirname(@location)
      else
        @location.split('/')[0..-2].join('/')
      end
    end

    # Is this a local package? Returns true if created from an in-memory object or a file/directory reference
    def local?
      return @local if @local
      return !@location.start_with?('http') if @location
      true
    end

    # Returns the resource list after (re)loading any entry that is not yet
    # a Resource object.
    def resources
      update_resources!
      @resources
    end

    # Returns the stored value for +property+, or +default+ when it is unset.
    def property(property, default = nil)
      self[property] || default
    end

    # Runs validation and returns whether the package satisfies its schema.
    def valid?
      validate
      @valid
    end

    # Validates against the schema, storing the errors (readable through
    # +errors+) and the overall result.
    def validate
      @errors = @schema.validation_errors(self)
      @valid = @schema.valid?(self)
    end

    # Returns true unless +location+ was recorded as a resource that failed
    # to load.
    def resource_exists?(location)
      !@dead_resources.include?(location)
    end

    # Serializes the descriptor using Hash's own JSON generation. Delegating
    # to super avoids the method calling itself.
    def to_json(*args)
      super
    end

    private

    # Defines a reader and a writer on this instance for every schema
    # property except 'resources', which has its own accessors.
    def define_properties!
      (@schema['properties'] || {}).each do |k, v|
        next if k == 'resources'
        define_singleton_method("#{k}=", proc { |p| set_property(k, p) })
        define_singleton_method(k.to_s, proc { property k, default_value(v) })
      end
    end

    def load_resources!
      @resources = (self['resources'] || [])
      update_resources!
    end

    # Replaces each raw resource with a loaded Resource. An entry that fails
    # to load has its path recorded as dead and becomes nil.
    def update_resources!
      @resources.map! do |resource|
        # A nil entry already failed on an earlier pass; leave it in place
        # instead of trying to load it (and to index into it) again.
        next if resource.nil?

        begin
          load_resource(resource)
        rescue StandardError
          @dead_resources << resource['path']
          nil
        end
      end
    end

    def load_resource(resource)
      if resource.is_a?(Resource)
        resource
      else
        Resource.load(resource, base)
      end
    end

    # Default returned by a generated reader when the property is unset.
    def default_value(schema_data)
      case schema_data['type']
      when 'string'
        nil
      when 'array'
        []
      when 'object'
        {}
      end
    end

    def set_property(key, value)
      self[key] = value
    end

    # Reads descriptor data from a directory, a containing URL, a JSON file
    # or URL, or a zip archive, and remembers where it came from.
    def read_package(package)
      if is_directory?(package)
        package = File.join(package, opts[:default_filename] || 'datapackage.json')
      elsif is_containing_url?(package)
        # URI.join replaces the last path segment unless the base ends with
        # a slash, so add one to keep the containing directory.
        package = URI.join(package.end_with?('/') ? package : "#{package}/", 'datapackage.json')
      end

      @location = package.to_s

      if File.extname(@location) == '.zip'
        unzip_package(package)
      else
        JSON.parse read_source(@location)
      end
    end

    # Returns the content found at +location+. Local paths are read as
    # files only, so a value such as "| command" is never executed, and the
    # handle is closed once the content has been read.
    def read_source(location)
      return File.read(location) unless location =~ REMOTE_LOCATION

      # URI.open is open-uri's entry point on current Rubies; where it is
      # not public, Kernel#open handles URLs.
      opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
      opener.call(location, &:read)
    end

    def is_directory?(package)
      !package.start_with?('http') && File.directory?(package)
    end

    def is_containing_url?(package)
      package.start_with?('http') && !package.end_with?('datapackage.json', 'datapackage.zip')
    end

    # Downloads +url+ into a Tempfile and returns it rewound.
    def write_to_tempfile(url)
      require 'tempfile'

      tempfile = Tempfile.new('datapackage')
      # The payload is a zip archive; keep it byte-for-byte.
      tempfile.binmode
      tempfile.write(read_source(url))
      tempfile.rewind
      tempfile
    end

    # Extracts a zipped package into a fresh temporary directory and returns
    # the parsed descriptor found inside the archive.
    def unzip_package(package)
      require 'tmpdir'

      package = write_to_tempfile(package) if package.start_with?('http')
      dir = Dir.mktmpdir
      Zip::File.open(package) do |zip_file|
        # Extract all the files
        zip_file.each { |entry| entry.extract("#{dir}/#{File.basename entry.name}") }
        # Get and parse the datapackage metadata
        entry = zip_file.glob("*/#{opts[:default_filename] || 'datapackage.json'}").first
        package = JSON.parse(entry.get_input_stream.read)
      end
      # Set the base dir to the directory we unzipped to
      @opts[:base] = dir
      # This is now a local file, not a URL
      @local = true
      package
    end
  end
end
@@ -1,81 +0,0 @@
1
module DataPackage
  ##
  # Allow loading Data Package profiles from a registry.
  #
  # The registry is a CSV file (local path or URL) with one row per profile;
  # rows are indexed by their +id+ column.

  class Registry

    DEFAULT_REGISTRY_URL = 'http://schemas.datapackages.org/registry.csv'
    DEFAULT_REGISTRY_PATH = File.join(File.expand_path(File.dirname(__FILE__)), '..', '..', 'datapackage', 'schemas', 'registry.csv')

    # Matches the locations that are fetched remotely instead of being read
    # from the local filesystem.
    REMOTE_LOCATION = %r{\A(?:https?|ftp)://}i

    # Directory of the registry file when it is a local file, otherwise nil.
    attr_reader :base_path

    # registry_path_or_url:: path or URL of the registry CSV; nil selects
    #                        DEFAULT_REGISTRY_PATH
    #
    # Raises RegistryError when the registry cannot be read or parsed.
    def initialize(registry_path_or_url = DEFAULT_REGISTRY_PATH)
      registry_path_or_url ||= DEFAULT_REGISTRY_PATH
      if File.file?(registry_path_or_url)
        @base_path = File.dirname(
          File.absolute_path(registry_path_or_url)
        )
      end
      @profiles = {}
      @registry = get_registry(registry_path_or_url)
    end

    # Returns the parsed JSON of the profile registered as +profile_id+, or
    # nil when the registry has no such id. Loaded profiles are cached.
    def get(profile_id)
      @profiles[profile_id] ||= get_profile(profile_id)
    end

    # Returns the registry rows as a Hash keyed by profile id.
    def available_profiles
      @registry
    end

    private

    # Loads a profile from its local schema_path when that file exists and
    # otherwise from its schema URL.
    def get_profile(profile_id)
      profile_metadata = @registry[profile_id]
      return if profile_metadata.nil?

      path = get_absolute_path(profile_metadata[:schema_path])

      if path && File.file?(path)
        load_json(path)
      else
        url = profile_metadata[:schema]
        load_json(url)
      end
    end

    def get_registry(registry_path_or_url)
      begin
        csv = parse_csv(registry_path_or_url)
        registry = {}
        csv.each { |row| registry[row.fetch(:id)] = Hash[row.headers.zip(row.fields)] }
      rescue KeyError, OpenURI::HTTPError, Errno::ENOENT
        raise(RegistryError)
      end
      registry
    end

    def parse_csv(path_or_url)
      csv = read_source(path_or_url)
      # Content without a single comma cannot be a registry table.
      raise RegistryError unless csv.include?(',')

      CSV.new(csv, headers: :first_row, header_converters: :symbol)
    end

    # Returns nil when there is no base path (File.join raises TypeError).
    def get_absolute_path(relative_path)
      File.join(@base_path, relative_path)
    rescue TypeError
      nil
    end

    def load_json(path)
      JSON.parse(read_source(path))
    rescue JSON::ParserError, OpenURI::HTTPError
      raise RegistryError
    end

    # Returns the content found at +location+. Local paths are read as
    # files only, so a value such as "| command" is never executed, and the
    # handle is closed once the content has been read.
    def read_source(location)
      return File.read(location) unless location =~ REMOTE_LOCATION

      # URI.open is open-uri's entry point on current Rubies; where it is
      # not public, Kernel#open handles URLs.
      opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
      opener.call(location, &:read)
    end

  end
end
@@ -1,83 +0,0 @@
1
module DataPackage
  # A single resource entry of a data package, held as a Hash of its
  # metadata. Subclasses supply +data+ for each way of reaching the content.
  class Resource < Hash

    def initialize(resource, base_path = '')
      merge! resource
    end

    # Builds the Resource subclass that matches how the data can be reached.
    # Returns nil when none of the cases below applies.
    def self.load(resource, base_path = '')
      # This returns if there are no alternative ways to access the data OR there
      # is a base_path which is a URL
      if is_url?(resource, base_path)
        RemoteResource.new(resource, base_path)
      # If there's a data attribute, we definitely want an inline resource
      elsif resource['data']
        InlineResource.new(resource)
      # If the file exists - we want a local resource
      elsif file_exists?(resource, base_path)
        LocalResource.new(resource, base_path)
      # If it doesn't exist and there's a URL to grab the data from, we want
      # a remote resource
      elsif resource['url']
        RemoteResource.new(resource, base_path)
      end
    end

    # True when the resource's path, taken relative to +base_path+ unless
    # that is empty, exists on disk.
    def self.file_exists?(resource, base_path)
      path = resource['path']
      path = File.join(base_path, path) if base_path != ''
      # File.exists? is the deprecated spelling and is gone from Ruby 3.2.
      File.exist?(path)
    end

    # True when the resource can only be reached through its url, or when
    # the package base itself is an http(s) location.
    def self.is_url?(resource, base_path)
      return true if !resource['url'].nil? && resource['path'].nil? && resource['data'].nil?

      base_path.start_with?('http')
    end

    # Parses the data as CSV into a JsonTableSchema::Table when the resource
    # declares a schema; returns nil otherwise. The table is built once.
    def table
      @table ||= JsonTableSchema::Table.new(CSV.parse(data), self['schema']) if self['schema']
    end

  end

  # Resource whose content is a file on the local filesystem.
  class LocalResource < Resource

    def initialize(resource, base_path = '')
      @base_path = base_path
      @path = resource['path']
      super
    end

    # Returns the file content. The full path is computed on each call
    # without touching @path, so repeated reads resolve to the same file.
    def data
      full_path = @base_path != '' ? File.join(@base_path, @path) : @path
      File.read(full_path)
    end

  end

  # Resource whose content is embedded in the descriptor itself.
  class InlineResource < Resource
    def data
      self['data']
    end
  end

  # Resource whose content is fetched from a URL.
  class RemoteResource < Resource

    def initialize(resource, base_url = '')
      @base_url = base_url
      @url = resource['url']
      @path = resource['path']
      super
    end

    # Downloads and returns the content at the resource's URL.
    def data
      target = resource_url
      # URI.open is open-uri's entry point on current Rubies; where it is
      # not public, Kernel#open handles URLs.
      opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
      opener.call(target, &:read)
    end

    private

    # The explicit url when present, otherwise the path resolved against
    # the base URL treated as a directory. URI.join replaces the last path
    # segment of a base without a trailing slash, so one is added first.
    def resource_url
      return @url.to_s if @url

      directory = @base_url.end_with?('/') ? @base_url : "#{@base_url}/"
      URI.join(directory, @path).to_s
    end

  end
end