imw 0.2.16 → 0.2.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/imw/dataset.rb +1 -1
- data/lib/imw/formats/delimited.rb +9 -14
- data/lib/imw/metadata.rb +38 -21
- data/lib/imw/metadata/contains_metadata.rb +35 -25
- data/lib/imw/metadata/field.rb +0 -28
- data/lib/imw/metadata/has_metadata.rb +93 -0
- data/lib/imw/metadata/has_summary.rb +51 -0
- data/lib/imw/metadata/schema.rb +6 -216
- data/lib/imw/resource.rb +2 -5
- data/lib/imw/schemes/http.rb +1 -1
- data/lib/imw/schemes/local.rb +18 -46
- data/lib/imw/schemes/sql.rb +12 -0
- data/lib/imw/tools/summarizer.rb +12 -13
- data/spec/imw/formats/delimited_spec.rb +3 -12
- data/spec/imw/metadata/contains_metadata_spec.rb +56 -0
- data/spec/imw/metadata/field_spec.rb +4 -5
- data/spec/imw/metadata/has_metadata_spec.rb +58 -0
- data/spec/imw/metadata/has_summary_spec.rb +32 -0
- data/spec/imw/metadata/schema_spec.rb +10 -13
- data/spec/imw/metadata_spec.rb +68 -21
- data/spec/imw/schemes/local_spec.rb +12 -22
- data/spec/imw/schemes/s3_spec.rb +0 -1
- metadata +12 -5
- data/lib/imw/metadata/schematized.rb +0 -27
data/lib/imw/resource.rb
CHANGED
@@ -73,23 +73,20 @@ module IMW
|
|
73
73
|
#
|
74
74
|
# @param [String, Addressable::URI] uri
|
75
75
|
# @param [Hash] options
|
76
|
-
# @option options [true, false] no_modules
|
77
76
|
# @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
|
78
|
-
# @option options [IMW::Metadata::Record, Array] schema the schema of this resource
|
79
77
|
# @return [IMW::Resource]
|
80
78
|
def initialize uri, options={}
|
81
79
|
self.uri = uri
|
82
80
|
self.resource_options = options
|
83
81
|
self.mode = options[:mode] || 'r'
|
84
|
-
self.schema = options[:schema] if options[:schema]
|
85
82
|
extend_appropriately!(options)
|
86
83
|
end
|
87
84
|
|
88
85
|
# Provides resources with a wrapped Addressable::URI object.
|
89
86
|
include IMW::Utils::HasURI
|
90
87
|
|
91
|
-
# Provides resources with a schema.
|
92
|
-
include IMW::Metadata::
|
88
|
+
# Provides resources with a summary, metadata, & schema.
|
89
|
+
include IMW::Metadata::HasSummary
|
93
90
|
|
94
91
|
# Gives IMW::Resource instances the ability to dynamically
|
95
92
|
# extend themselves with modules chosen from a set of handlers
|
data/lib/imw/schemes/http.rb
CHANGED
data/lib/imw/schemes/local.rb
CHANGED
@@ -71,6 +71,8 @@ module IMW
|
|
71
71
|
# Defines methods appropriate for a local file.
|
72
72
|
module LocalFile
|
73
73
|
|
74
|
+
include IMW::Metadata::HasMetadata
|
75
|
+
|
74
76
|
# Is this resource a regular file?
|
75
77
|
#
|
76
78
|
# @return [true, false]
|
@@ -173,7 +175,7 @@ module IMW
|
|
173
175
|
def snippet
|
174
176
|
returning([]) do |snip|
|
175
177
|
(io.read(1024) || '').bytes.each do |byte|
|
176
|
-
|
178
|
+
# CR LF SPACE ~
|
177
179
|
snip << byte.chr if byte == 13 || byte == 10 || byte >= 32 && byte <= 126
|
178
180
|
end
|
179
181
|
end.join
|
@@ -206,19 +208,17 @@ module IMW
|
|
206
208
|
# - basename
|
207
209
|
# - size
|
208
210
|
# - extension
|
209
|
-
# -
|
210
|
-
def
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
:num_lines => num_lines
|
216
|
-
}
|
217
|
-
data[:snippet] = snippet if respond_to?(:snippet)
|
218
|
-
data[:schema] = schema if respond_to?(:schema)
|
219
|
-
data
|
211
|
+
# - num_lines
|
212
|
+
def external_summary
|
213
|
+
super().merge({
|
214
|
+
:size => size,
|
215
|
+
:num_lines => num_lines
|
216
|
+
})
|
220
217
|
end
|
221
218
|
|
219
|
+
|
220
|
+
|
221
|
+
|
222
222
|
protected
|
223
223
|
|
224
224
|
# Return a triple of line, word, and character counts for this
|
@@ -231,7 +231,7 @@ module IMW
|
|
231
231
|
@wc ||= begin
|
232
232
|
`wc #{path}`.chomp.strip.split.map(&:to_i)
|
233
233
|
rescue
|
234
|
-
[
|
234
|
+
[nil,nil,nil] # FIXME
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
@@ -388,44 +388,16 @@ module IMW
|
|
388
388
|
end
|
389
389
|
end
|
390
390
|
|
391
|
-
# Return a hash summarizing this directory with a key
|
392
|
-
# <tt>:contents</tt> containing an array of hashes summarizing
|
393
|
-
# this directories contents.
|
394
|
-
#
|
395
391
|
# The directory summary includes the following information
|
396
|
-
# - basename
|
397
392
|
# - size
|
398
393
|
# - num_files
|
399
|
-
# - contents
|
400
394
|
#
|
401
395
|
# @return [Hash]
|
402
|
-
def
|
403
|
-
{
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
:contents => resources.map do |resource|
|
408
|
-
resource.guess_schema! if guess_schema? && resource.respond_to?(:guess_schema!)
|
409
|
-
resource_summary = resource.summary
|
410
|
-
resource_summary[:schema] = metadata[resource] if metadata && metadata.describe?(resource) # this should be handled by 'resources' method above
|
411
|
-
resource_summary
|
412
|
-
end
|
413
|
-
}
|
414
|
-
end
|
415
|
-
|
416
|
-
# Whether or not to have this directory's resources guess
|
417
|
-
# their schemas when none is provided.
|
418
|
-
#
|
419
|
-
# @return [true, false]
|
420
|
-
def guess_schema?
|
421
|
-
(!! @guess_schema)
|
422
|
-
end
|
423
|
-
|
424
|
-
# Force this directory's resources to guess at their schema.
|
425
|
-
#
|
426
|
-
# @return [true]
|
427
|
-
def guess_schema!
|
428
|
-
@guess_schema = true
|
396
|
+
def external_summary
|
397
|
+
super().merge({
|
398
|
+
:size => size,
|
399
|
+
:num_files => contents.length,
|
400
|
+
})
|
429
401
|
end
|
430
402
|
|
431
403
|
end
|
data/lib/imw/schemes/sql.rb
CHANGED
@@ -46,6 +46,18 @@ module IMW
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
# Return a summary of this database.
|
50
|
+
#
|
51
|
+
# Purposefully does not call +super+.
|
52
|
+
#
|
53
|
+
# @return [Hash]
|
54
|
+
def external_summary
|
55
|
+
{
|
56
|
+
:uri => uri.to_s,
|
57
|
+
:database => database
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
49
61
|
# The (cached) database connection for this resource.
|
50
62
|
#
|
51
63
|
# @return [DBI::DatabaseHandle]
|
data/lib/imw/tools/summarizer.rb
CHANGED
@@ -43,26 +43,25 @@ module IMW
|
|
43
43
|
|
44
44
|
# Return a summary of the +inputs+ to this Summarizer.
|
45
45
|
#
|
46
|
-
#
|
47
|
-
# IMW::Resource in +inputs+.
|
46
|
+
# Will swallow errors.
|
48
47
|
#
|
49
48
|
# @return [Array<Hash>]
|
50
49
|
def summary
|
51
|
-
@summary ||=
|
52
|
-
#input.guess_schema! if input.schema.nil? && input.respond_to?(:guess_schema!)
|
53
|
-
(input.respond_to?(:summary) ? input.summary : {}) rescue {}
|
54
|
-
end
|
50
|
+
@summary ||= summary! rescue []
|
55
51
|
end
|
56
52
|
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
53
|
+
# Return a summary of the +inputs+ to this summarizer.
|
54
|
+
#
|
55
|
+
# Delegates to the +summary+ method of each constituent
|
56
|
+
# IMW::Resource in +inputs+.
|
60
57
|
#
|
61
|
-
# @return [
|
62
|
-
def
|
63
|
-
|
58
|
+
# @return [Array]
|
59
|
+
def summary!
|
60
|
+
inputs.map do |input|
|
61
|
+
(input.respond_to?(:summary) ? input.summary : {})
|
62
|
+
end
|
64
63
|
end
|
65
|
-
|
64
|
+
|
66
65
|
protected
|
67
66
|
# Set new inputs for this summarizer.
|
68
67
|
#
|
@@ -18,32 +18,23 @@ describe IMW::Formats::Csv do
|
|
18
18
|
IMW.open('test.csv').load[1].last.should == "4"
|
19
19
|
end
|
20
20
|
|
21
|
-
it "should raise an error on an invalid schema" do
|
22
|
-
lambda { @sample.schema = [{:name => :foobar, :has_many => {:associations => [:foo, :bar]}}] }.should raise_error(IMW::SchemaError)
|
23
|
-
end
|
24
|
-
|
25
|
-
it "should accept a valid schema" do
|
26
|
-
@sample.schema = [:foo, :bar, :baz]
|
27
|
-
@sample.schema.should == [{:name => 'foo'}, {:name => 'bar'}, {:name => 'baz'}]
|
28
|
-
end
|
29
|
-
|
30
21
|
describe "guessing a schema" do
|
31
22
|
|
32
23
|
Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].each do |path|
|
33
24
|
it "should correctly guess that with_schema/#{File.basename(path)} has headers in its first row" do
|
34
|
-
IMW.open(path).
|
25
|
+
IMW.open(path).fields_in_first_line?.should == true
|
35
26
|
end
|
36
27
|
end
|
37
28
|
|
38
29
|
Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/without_schema/*')].each do |path|
|
39
30
|
it "should correctly guess that without_schema/#{File.basename(path)} does not have headers in its first row" do
|
40
|
-
IMW.open(path).
|
31
|
+
IMW.open(path).fields_in_first_line?.should == false
|
41
32
|
end
|
42
33
|
end
|
43
34
|
|
44
35
|
it "should automatically set the headers on a source with guessed headers" do
|
45
36
|
resource = IMW.open(Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].first)
|
46
|
-
resource.
|
37
|
+
resource.guess_fields!
|
47
38
|
resource.delimited_options[:headers].class.should == Array
|
48
39
|
resource.schema.should_not be_empty
|
49
40
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Metadata::ContainsMetadata do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Foo
|
7
|
+
attr_accessor :contents
|
8
|
+
def path ; IMWTest::TMP_DIR ; end
|
9
|
+
def basename ; File.basename(IMWTest::TMP_DIR) ; end
|
10
|
+
include IMW::Metadata::ContainsMetadata
|
11
|
+
end
|
12
|
+
@foo = Foo.new
|
13
|
+
@foo.contents = []
|
14
|
+
end
|
15
|
+
|
16
|
+
describe 'finding the default metadata URI' do
|
17
|
+
it "should return the default metadata URI when 'contents' is empty" do
|
18
|
+
@foo.default_metadata_uri.should == File.join(IMWTest::TMP_DIR, File.basename(IMWTest::TMP_DIR) + ".icss.yaml")
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should return the default metadata URI when 'contents' doesn't contain any metadata files" do
|
22
|
+
@foo.contents.concat ['bar.txt', 'crazy_file.yaml', 'foo.json'].map { |p| File.join(IMWTest::TMP_DIR, p) }
|
23
|
+
@foo.default_metadata_uri.should == File.join(IMWTest::TMP_DIR, File.basename(IMWTest::TMP_DIR) + ".icss.yaml")
|
24
|
+
end
|
25
|
+
|
26
|
+
%w[my-projects.icss.yaml stupid-crazy-fool-of-a-dataset-icss.json foobar-25.metadata.buzz.yml].each do |basename|
|
27
|
+
it "should return the metadata URI when 'contents' contains a URI matching '#{basename}'" do
|
28
|
+
@foo.contents.concat ['bar.txt', 'crazy_file.yaml', 'foo.json', basename].map { |p| File.join(IMWTest::TMP_DIR, p) }
|
29
|
+
@foo.default_metadata_uri.should == File.join(IMWTest::TMP_DIR, basename)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
describe 'returning its metadata' do
|
36
|
+
it "should return 'nil' when no metadata exists on disk" do
|
37
|
+
@foo.metadata.should be_nil
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should return Metadata when metadata exists on disk" do
|
41
|
+
IMW.open!(@foo.default_metadata_uri) do |f|
|
42
|
+
f.write <<YAML
|
43
|
+
---
|
44
|
+
foo:
|
45
|
+
description: bar
|
46
|
+
fields: baz
|
47
|
+
YAML
|
48
|
+
end
|
49
|
+
@foo.metadata.class.should == IMW::Metadata
|
50
|
+
@foo.metadata['foo']['description'].should == 'bar'
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
end
|
@@ -3,17 +3,16 @@ require File.dirname(__FILE__) + "/../../spec_helper"
|
|
3
3
|
describe IMW::Metadata::Field do
|
4
4
|
|
5
5
|
describe "initializing" do
|
6
|
-
it "should parse a
|
7
|
-
IMW::Metadata::Field.new(
|
8
|
-
IMW::Metadata::Field.new('foobar').should == { :name => 'foobar' }
|
6
|
+
it "should parse a string into a hash" do
|
7
|
+
IMW::Metadata::Field.new('foobar').should == { "name" => 'foobar' }
|
9
8
|
end
|
10
9
|
|
11
10
|
it "should raise an error on a Hash without a :name key" do
|
12
|
-
lambda { IMW::Metadata::Field.new(
|
11
|
+
lambda { IMW::Metadata::Field.new('foo' => 'bar') }.should raise_error(IMW::ArgumentError)
|
13
12
|
end
|
14
13
|
|
15
14
|
it "should accept a Hash with a :name key" do
|
16
|
-
data = {
|
15
|
+
data = { 'name' => :foobar, 'title' => "Bazbooz", 'unit' => "m" }
|
17
16
|
IMW::Metadata::Field.new(data).should == data
|
18
17
|
end
|
19
18
|
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Metadata::HasMetadata do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Foo
|
7
|
+
def uri ; File.join(IMWTest::TMP_DIR, 'test', 'subdir', 'foobar.csv') ; end
|
8
|
+
def basename ; File.basename(uri) ; end
|
9
|
+
def extension ; 'csv' ; end
|
10
|
+
def dir ; IMW.open(File.join(IMWTest::TMP_DIR, 'test', 'subdir')) ; end
|
11
|
+
include IMW::Metadata::HasMetadata
|
12
|
+
end
|
13
|
+
@foo = Foo.new
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should be able to build a schema" do
|
17
|
+
@foo.schema.should include(:type, :namespace, :name, :doc, :fields, :non_avro)
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "finding its metadata" do
|
21
|
+
|
22
|
+
before do
|
23
|
+
FileUtils.mkdir_p(@foo.dir.path)
|
24
|
+
IMWTest::Random.file(File.join(@foo.dir.path, 'foobar.csv'))
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should return 'nil' when it can't find any metadata" do
|
28
|
+
@foo.metadata.should be_nil
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should return 'nil' when a metadata file is found that doesn't describe it" do
|
32
|
+
IMW.open!("has_metadata_test.icss.yaml") do |f|
|
33
|
+
f.write <<YAML
|
34
|
+
---
|
35
|
+
foobar.csv:
|
36
|
+
description: bar
|
37
|
+
fields: ["baz", "booz"]
|
38
|
+
YAML
|
39
|
+
end
|
40
|
+
@foo.metadata.should be_nil
|
41
|
+
end
|
42
|
+
|
43
|
+
# it "should return the metadata when a metadata file is found that does describe it" do
|
44
|
+
# IMW.open!("has_metadata_test.icss.yaml") do |f|
|
45
|
+
# f.write <<YAML
|
46
|
+
# ---
|
47
|
+
# #{IMWTest::TMP_DIR}/test/subdir/foobar.csv:
|
48
|
+
# description: bar
|
49
|
+
# fields: ["baz", "booz"]
|
50
|
+
# YAML
|
51
|
+
# end
|
52
|
+
# @foo.metadata.class.should == IMW::Metadata
|
53
|
+
# @foo.metadata[@foo]['description'].should == 'bar'
|
54
|
+
# end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Metadata::HasSummary do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Foo
|
7
|
+
def initialize(*args) ; @args = args ; end
|
8
|
+
def uri ; File.join(IMWTest::TMP_DIR, *@args) ; end
|
9
|
+
def basename ; File.basename(uri) ; end
|
10
|
+
def extension ; File.extname(@args.last || '').gsub(/^\./,'') ; end
|
11
|
+
include IMW::Metadata::HasSummary
|
12
|
+
end
|
13
|
+
@foo = Foo.new('foo', 'bar.csv')
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should build a summary from an external summary" do
|
17
|
+
@foo.summary.should include(:uri, :basename, :extension)
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should build a summary from an external summary and a schema when possible" do
|
21
|
+
@foo.stub!(:schema).and_return({:foo => 'bar'})
|
22
|
+
@foo.summary[:schema].should == {:foo => 'bar'}
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should be able to build an external summary describing how it's situated in the world" do
|
26
|
+
@foo.summary[:uri].should == File.join(IMWTest::TMP_DIR, 'foo', 'bar.csv')
|
27
|
+
@foo.summary[:basename].should == 'bar.csv'
|
28
|
+
@foo.summary[:extension].should == 'csv'
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
@@ -3,25 +3,22 @@ require File.dirname(__FILE__) + "/../../spec_helper"
|
|
3
3
|
describe IMW::Metadata::Schema do
|
4
4
|
|
5
5
|
describe "initializing" do
|
6
|
-
it "should
|
7
|
-
IMW::Metadata::Schema.new(
|
6
|
+
it "should merge with a Hash" do
|
7
|
+
IMW::Metadata::Schema.new({:foo => 'foobar'}).should == { :foo => 'foobar' }
|
8
8
|
end
|
9
9
|
|
10
|
-
it "should
|
11
|
-
|
12
|
-
IMW::Metadata::Schema.new(orig_schema).should == orig_schema
|
10
|
+
it "should merge with a Schema" do
|
11
|
+
IMW::Metadata::Schema.new(IMW::Metadata::Schema.new({:foo => 'foobar'})).should == { :foo => 'foobar' }
|
13
12
|
end
|
14
|
-
end
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
resource = IMW.open('some_resource')
|
19
|
-
resource.should_receive(:load).and_return(%w[foo bar baz])
|
20
|
-
IMW.should_receive(:open).and_return(resource)
|
21
|
-
IMW::Metadata::Schema.load(resource.to_s).map { |field| field[:name] }.should == %w[foo bar baz]
|
14
|
+
it "should ignore anything else" do
|
15
|
+
IMW::Metadata::Schema.new('foobar').should == {}
|
22
16
|
end
|
23
17
|
|
18
|
+
it "should accept empty args" do
|
19
|
+
IMW::Metadata::Schema.new.should == {}
|
20
|
+
end
|
21
|
+
|
24
22
|
end
|
25
23
|
|
26
24
|
end
|
27
|
-
|