imw 0.2.16 → 0.2.17
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/imw/dataset.rb +1 -1
- data/lib/imw/formats/delimited.rb +9 -14
- data/lib/imw/metadata.rb +38 -21
- data/lib/imw/metadata/contains_metadata.rb +35 -25
- data/lib/imw/metadata/field.rb +0 -28
- data/lib/imw/metadata/has_metadata.rb +93 -0
- data/lib/imw/metadata/has_summary.rb +51 -0
- data/lib/imw/metadata/schema.rb +6 -216
- data/lib/imw/resource.rb +2 -5
- data/lib/imw/schemes/http.rb +1 -1
- data/lib/imw/schemes/local.rb +18 -46
- data/lib/imw/schemes/sql.rb +12 -0
- data/lib/imw/tools/summarizer.rb +12 -13
- data/spec/imw/formats/delimited_spec.rb +3 -12
- data/spec/imw/metadata/contains_metadata_spec.rb +56 -0
- data/spec/imw/metadata/field_spec.rb +4 -5
- data/spec/imw/metadata/has_metadata_spec.rb +58 -0
- data/spec/imw/metadata/has_summary_spec.rb +32 -0
- data/spec/imw/metadata/schema_spec.rb +10 -13
- data/spec/imw/metadata_spec.rb +68 -21
- data/spec/imw/schemes/local_spec.rb +12 -22
- data/spec/imw/schemes/s3_spec.rb +0 -1
- metadata +12 -5
- data/lib/imw/metadata/schematized.rb +0 -27
data/lib/imw/resource.rb
CHANGED
@@ -73,23 +73,20 @@ module IMW
|
|
73
73
|
#
|
74
74
|
# @param [String, Addressable::URI] uri
|
75
75
|
# @param [Hash] options
|
76
|
-
# @option options [true, false] no_modules
|
77
76
|
# @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
|
78
|
-
# @option options [IMW::Metadata::Record, Array] schema the schema of this resource
|
79
77
|
# @return [IMW::Resource]
|
80
78
|
def initialize uri, options={}
|
81
79
|
self.uri = uri
|
82
80
|
self.resource_options = options
|
83
81
|
self.mode = options[:mode] || 'r'
|
84
|
-
self.schema = options[:schema] if options[:schema]
|
85
82
|
extend_appropriately!(options)
|
86
83
|
end
|
87
84
|
|
88
85
|
# Provides resources with a wrapped Addressable::URI object.
|
89
86
|
include IMW::Utils::HasURI
|
90
87
|
|
91
|
-
# Provides resources with a schema.
|
92
|
-
include IMW::Metadata::
|
88
|
+
# Provides resources with a summary, metadata, & schema.
|
89
|
+
include IMW::Metadata::HasSummary
|
93
90
|
|
94
91
|
# Gives IMW::Resource instances with the ability to dynamically
|
95
92
|
# extend themselves with modules chosen from a set of handlers
|
data/lib/imw/schemes/http.rb
CHANGED
data/lib/imw/schemes/local.rb
CHANGED
@@ -71,6 +71,8 @@ module IMW
|
|
71
71
|
# Defines methods for appropriate for a local file.
|
72
72
|
module LocalFile
|
73
73
|
|
74
|
+
include IMW::Metadata::HasMetadata
|
75
|
+
|
74
76
|
# Is this resource a regular file?
|
75
77
|
#
|
76
78
|
# @return [true, false]
|
@@ -173,7 +175,7 @@ module IMW
|
|
173
175
|
def snippet
|
174
176
|
returning([]) do |snip|
|
175
177
|
(io.read(1024) || '').bytes.each do |byte|
|
176
|
-
|
178
|
+
# CR LF SPACE ~
|
177
179
|
snip << byte.chr if byte == 13 || byte == 10 || byte >= 32 && byte <= 126
|
178
180
|
end
|
179
181
|
end.join
|
@@ -206,19 +208,17 @@ module IMW
|
|
206
208
|
# - basename
|
207
209
|
# - size
|
208
210
|
# - extension
|
209
|
-
# -
|
210
|
-
def
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
:num_lines => num_lines
|
216
|
-
}
|
217
|
-
data[:snippet] = snippet if respond_to?(:snippet)
|
218
|
-
data[:schema] = schema if respond_to?(:schema)
|
219
|
-
data
|
211
|
+
# - num_lines
|
212
|
+
def external_summary
|
213
|
+
super().merge({
|
214
|
+
:size => size,
|
215
|
+
:num_lines => num_lines
|
216
|
+
})
|
220
217
|
end
|
221
218
|
|
219
|
+
|
220
|
+
|
221
|
+
|
222
222
|
protected
|
223
223
|
|
224
224
|
# Return a triple of line, word, and character counts for this
|
@@ -231,7 +231,7 @@ module IMW
|
|
231
231
|
@wc ||= begin
|
232
232
|
`wc #{path}`.chomp.strip.split.map(&:to_i)
|
233
233
|
rescue
|
234
|
-
[
|
234
|
+
[nil,nil,nil] # FIXME
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
@@ -388,44 +388,16 @@ module IMW
|
|
388
388
|
end
|
389
389
|
end
|
390
390
|
|
391
|
-
# Return a hash summarizing this directory with a key
|
392
|
-
# <tt>:contents</tt> containing an array of hashes summarizing
|
393
|
-
# this directories contents.
|
394
|
-
#
|
395
391
|
# The directory summary includes the following information
|
396
|
-
# - basename
|
397
392
|
# - size
|
398
393
|
# - num_files
|
399
|
-
# - contents
|
400
394
|
#
|
401
395
|
# @return [Hash]
|
402
|
-
def
|
403
|
-
{
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
:contents => resources.map do |resource|
|
408
|
-
resource.guess_schema! if guess_schema? && resource.respond_to?(:guess_schema!)
|
409
|
-
resource_summary = resource.summary
|
410
|
-
resource_summary[:schema] = metadata[resource] if metadata && metadata.describe?(resource) # this should be handled by 'resources' method above
|
411
|
-
resource_summary
|
412
|
-
end
|
413
|
-
}
|
414
|
-
end
|
415
|
-
|
416
|
-
# Whether or not to have this directory's resources guess
|
417
|
-
# their schemas when none is provided.
|
418
|
-
#
|
419
|
-
# @return [true, false]
|
420
|
-
def guess_schema?
|
421
|
-
(!! @guess_schema)
|
422
|
-
end
|
423
|
-
|
424
|
-
# Force this directory's resources to guess at their schema.
|
425
|
-
#
|
426
|
-
# @return [true]
|
427
|
-
def guess_schema!
|
428
|
-
@guess_schema = true
|
396
|
+
def external_summary
|
397
|
+
super().merge({
|
398
|
+
:size => size,
|
399
|
+
:num_files => contents.length,
|
400
|
+
})
|
429
401
|
end
|
430
402
|
|
431
403
|
end
|
data/lib/imw/schemes/sql.rb
CHANGED
@@ -46,6 +46,18 @@ module IMW
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
# Return a summary of this database.
|
50
|
+
#
|
51
|
+
# Purposefully does not call +super+.
|
52
|
+
#
|
53
|
+
# @return [Hash]
|
54
|
+
def external_summary
|
55
|
+
{
|
56
|
+
:uri => uri.to_s,
|
57
|
+
:database => database
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
49
61
|
# The (cached) database connection for this resource.
|
50
62
|
#
|
51
63
|
# @return [DBI::DatabaseHandle]
|
data/lib/imw/tools/summarizer.rb
CHANGED
@@ -43,26 +43,25 @@ module IMW
|
|
43
43
|
|
44
44
|
# Return a summary of the +inputs+ to this Summarizer.
|
45
45
|
#
|
46
|
-
#
|
47
|
-
# IMW::Resource in +inputs+.
|
46
|
+
# Will swallow errors.
|
48
47
|
#
|
49
48
|
# @return [Array<Hash>]
|
50
49
|
def summary
|
51
|
-
@summary ||=
|
52
|
-
#input.guess_schema! if input.schema.nil? && input.respond_to?(:guess_schema!)
|
53
|
-
(input.respond_to?(:summary) ? input.summary : {}) rescue {}
|
54
|
-
end
|
50
|
+
@summary ||= summary! rescue []
|
55
51
|
end
|
56
52
|
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
53
|
+
# Return a summary of the +inputs+ to this summarizer.
|
54
|
+
#
|
55
|
+
# Delegates to the +summary+ method of each constituent
|
56
|
+
# IMW::Resource in +inputs+.
|
60
57
|
#
|
61
|
-
# @return [
|
62
|
-
def
|
63
|
-
|
58
|
+
# @return [Array]
|
59
|
+
def summary!
|
60
|
+
inputs.map do |input|
|
61
|
+
(input.respond_to?(:summary) ? input.summary : {})
|
62
|
+
end
|
64
63
|
end
|
65
|
-
|
64
|
+
|
66
65
|
protected
|
67
66
|
# Set new inputs for this summarizer.
|
68
67
|
#
|
@@ -18,32 +18,23 @@ describe IMW::Formats::Csv do
|
|
18
18
|
IMW.open('test.csv').load[1].last.should == "4"
|
19
19
|
end
|
20
20
|
|
21
|
-
it "should raise an error on an invalid schema" do
|
22
|
-
lambda { @sample.schema = [{:name => :foobar, :has_many => {:associations => [:foo, :bar]}}] }.should raise_error(IMW::SchemaError)
|
23
|
-
end
|
24
|
-
|
25
|
-
it "should accept a valid schema" do
|
26
|
-
@sample.schema = [:foo, :bar, :baz]
|
27
|
-
@sample.schema.should == [{:name => 'foo'}, {:name => 'bar'}, {:name => 'baz'}]
|
28
|
-
end
|
29
|
-
|
30
21
|
describe "guessing a schema" do
|
31
22
|
|
32
23
|
Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].each do |path|
|
33
24
|
it "should correctly guess that with_schema/#{File.basename(path)} has headers in its first row" do
|
34
|
-
IMW.open(path).
|
25
|
+
IMW.open(path).fields_in_first_line?.should == true
|
35
26
|
end
|
36
27
|
end
|
37
28
|
|
38
29
|
Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/without_schema/*')].each do |path|
|
39
30
|
it "should correctly guess that without_schema/#{File.basename(path)} does not have headers in its first row" do
|
40
|
-
IMW.open(path).
|
31
|
+
IMW.open(path).fields_in_first_line?.should == false
|
41
32
|
end
|
42
33
|
end
|
43
34
|
|
44
35
|
it "should automatically set the headers on a source with guessed headers" do
|
45
36
|
resource = IMW.open(Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].first)
|
46
|
-
resource.
|
37
|
+
resource.guess_fields!
|
47
38
|
resource.delimited_options[:headers].class.should == Array
|
48
39
|
resource.schema.should_not be_empty
|
49
40
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Metadata::ContainsMetadata do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Foo
|
7
|
+
attr_accessor :contents
|
8
|
+
def path ; IMWTest::TMP_DIR ; end
|
9
|
+
def basename ; File.basename(IMWTest::TMP_DIR) ; end
|
10
|
+
include IMW::Metadata::ContainsMetadata
|
11
|
+
end
|
12
|
+
@foo = Foo.new
|
13
|
+
@foo.contents = []
|
14
|
+
end
|
15
|
+
|
16
|
+
describe 'finding the default metadata URI' do
|
17
|
+
it "should return the default metadata URI when 'contents' is empty" do
|
18
|
+
@foo.default_metadata_uri.should == File.join(IMWTest::TMP_DIR, File.basename(IMWTest::TMP_DIR) + ".icss.yaml")
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should return the default metadata URI when 'contents' doesn't contain any metadata files" do
|
22
|
+
@foo.contents.concat ['bar.txt', 'crazy_file.yaml', 'foo.json'].map { |p| File.join(IMWTest::TMP_DIR, p) }
|
23
|
+
@foo.default_metadata_uri.should == File.join(IMWTest::TMP_DIR, File.basename(IMWTest::TMP_DIR) + ".icss.yaml")
|
24
|
+
end
|
25
|
+
|
26
|
+
%w[my-projects.icss.yaml stupid-crazy-fool-of-a-dataset-icss.json foobar-25.metadata.buzz.yml].each do |basename|
|
27
|
+
it "should return the metadata URI when 'contents' contains a URI matching '#{basename}'" do
|
28
|
+
@foo.contents.concat ['bar.txt', 'crazy_file.yaml', 'foo.json', basename].map { |p| File.join(IMWTest::TMP_DIR, p) }
|
29
|
+
@foo.default_metadata_uri.should == File.join(IMWTest::TMP_DIR, basename)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
describe 'returning its metadata' do
|
36
|
+
it "should return 'nil' when no metadata exists on disk" do
|
37
|
+
@foo.metadata.should be_nil
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should return Metadata when metadata exists on disk" do
|
41
|
+
IMW.open!(@foo.default_metadata_uri) do |f|
|
42
|
+
f.write <<YAML
|
43
|
+
---
|
44
|
+
foo:
|
45
|
+
description: bar
|
46
|
+
fields: baz
|
47
|
+
YAML
|
48
|
+
end
|
49
|
+
@foo.metadata.class.should == IMW::Metadata
|
50
|
+
@foo.metadata['foo']['description'].should == 'bar'
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
end
|
@@ -3,17 +3,16 @@ require File.dirname(__FILE__) + "/../../spec_helper"
|
|
3
3
|
describe IMW::Metadata::Field do
|
4
4
|
|
5
5
|
describe "initializing" do
|
6
|
-
it "should parse a
|
7
|
-
IMW::Metadata::Field.new(
|
8
|
-
IMW::Metadata::Field.new('foobar').should == { :name => 'foobar' }
|
6
|
+
it "should parse a string into a hash" do
|
7
|
+
IMW::Metadata::Field.new('foobar').should == { "name" => 'foobar' }
|
9
8
|
end
|
10
9
|
|
11
10
|
it "should raise an error on a Hash without a :name key" do
|
12
|
-
lambda { IMW::Metadata::Field.new(
|
11
|
+
lambda { IMW::Metadata::Field.new('foo' => 'bar') }.should raise_error(IMW::ArgumentError)
|
13
12
|
end
|
14
13
|
|
15
14
|
it "should accept a Hash with a :name key" do
|
16
|
-
data = {
|
15
|
+
data = { 'name' => :foobar, 'title' => "Bazbooz", 'unit' => "m" }
|
17
16
|
IMW::Metadata::Field.new(data).should == data
|
18
17
|
end
|
19
18
|
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Metadata::HasMetadata do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Foo
|
7
|
+
def uri ; File.join(IMWTest::TMP_DIR, 'test', 'subdir', 'foobar.csv') ; end
|
8
|
+
def basename ; File.basename(uri) ; end
|
9
|
+
def extension ; 'csv' ; end
|
10
|
+
def dir ; IMW.open(File.join(IMWTest::TMP_DIR, 'test', 'subdir')) ; end
|
11
|
+
include IMW::Metadata::HasMetadata
|
12
|
+
end
|
13
|
+
@foo = Foo.new
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should be able to build a schema" do
|
17
|
+
@foo.schema.should include(:type, :namespace, :name, :doc, :fields, :non_avro)
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "finding its metadata" do
|
21
|
+
|
22
|
+
before do
|
23
|
+
FileUtils.mkdir_p(@foo.dir.path)
|
24
|
+
IMWTest::Random.file(File.join(@foo.dir.path, 'foobar.csv'))
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should return 'nil' when it can't find any metadata" do
|
28
|
+
@foo.metadata.should be_nil
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should return 'nil' when a metadata file is found that doesn't describe it" do
|
32
|
+
IMW.open!("has_metadata_test.icss.yaml") do |f|
|
33
|
+
f.write <<YAML
|
34
|
+
---
|
35
|
+
foobar.csv:
|
36
|
+
description: bar
|
37
|
+
fields: ["baz", "booz"]
|
38
|
+
YAML
|
39
|
+
end
|
40
|
+
@foo.metadata.should be_nil
|
41
|
+
end
|
42
|
+
|
43
|
+
# it "should return the metadata when a metadata file is found that does describe it" do
|
44
|
+
# IMW.open!("has_metadata_test.icss.yaml") do |f|
|
45
|
+
# f.write <<YAML
|
46
|
+
# ---
|
47
|
+
# #{IMWTest::TMP_DIR}/test/subdir/foobar.csv:
|
48
|
+
# description: bar
|
49
|
+
# fields: ["baz", "booz"]
|
50
|
+
# YAML
|
51
|
+
# end
|
52
|
+
# @foo.metadata.class.should == IMW::Metadata
|
53
|
+
# @foo.metadata[@foo]['description'].should == 'bar'
|
54
|
+
# end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Metadata::HasSummary do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Foo
|
7
|
+
def initialize(*args) ; @args = args ; end
|
8
|
+
def uri ; File.join(IMWTest::TMP_DIR, *@args) ; end
|
9
|
+
def basename ; File.basename(uri) ; end
|
10
|
+
def extension ; File.extname(@args.last || '').gsub(/^\./,'') ; end
|
11
|
+
include IMW::Metadata::HasSummary
|
12
|
+
end
|
13
|
+
@foo = Foo.new('foo', 'bar.csv')
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should build a summary from an external summary" do
|
17
|
+
@foo.summary.should include(:uri, :basename, :extension)
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should build a summary from an external summary and a schema when possible" do
|
21
|
+
@foo.stub!(:schema).and_return({:foo => 'bar'})
|
22
|
+
@foo.summary[:schema].should == {:foo => 'bar'}
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should be able to build an external summary describing how it's situated in the world" do
|
26
|
+
@foo.summary[:uri].should == File.join(IMWTest::TMP_DIR, 'foo', 'bar.csv')
|
27
|
+
@foo.summary[:basename].should == 'bar.csv'
|
28
|
+
@foo.summary[:extension].should == 'csv'
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
@@ -3,25 +3,22 @@ require File.dirname(__FILE__) + "/../../spec_helper"
|
|
3
3
|
describe IMW::Metadata::Schema do
|
4
4
|
|
5
5
|
describe "initializing" do
|
6
|
-
it "should
|
7
|
-
IMW::Metadata::Schema.new(
|
6
|
+
it "should merge with a Hash" do
|
7
|
+
IMW::Metadata::Schema.new({:foo => 'foobar'}).should == { :foo => 'foobar' }
|
8
8
|
end
|
9
9
|
|
10
|
-
it "should
|
11
|
-
|
12
|
-
IMW::Metadata::Schema.new(orig_schema).should == orig_schema
|
10
|
+
it "should merge with a Schema" do
|
11
|
+
IMW::Metadata::Schema.new(IMW::Metadata::Schema.new({:foo => 'foobar'})).should == { :foo => 'foobar' }
|
13
12
|
end
|
14
|
-
end
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
resource = IMW.open('some_resource')
|
19
|
-
resource.should_receive(:load).and_return(%w[foo bar baz])
|
20
|
-
IMW.should_receive(:open).and_return(resource)
|
21
|
-
IMW::Metadata::Schema.load(resource.to_s).map { |field| field[:name] }.should == %w[foo bar baz]
|
14
|
+
it "should ignore anything else" do
|
15
|
+
IMW::Metadata::Schema.new('foobar').should == {}
|
22
16
|
end
|
23
17
|
|
18
|
+
it "should accept empty args" do
|
19
|
+
IMW::Metadata::Schema.new.should == {}
|
20
|
+
end
|
21
|
+
|
24
22
|
end
|
25
23
|
|
26
24
|
end
|
27
|
-
|