hydra-file_characterization 0.3.2 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +91 -0
- data/.github_changelog_generator +2 -0
- data/.rubocop.yml +10 -0
- data/.rubocop_todo.yml +92 -0
- data/CHANGELOG.md +189 -0
- data/CODE_OF_CONDUCT.md +36 -0
- data/CONTRIBUTING.md +70 -22
- data/Gemfile +9 -0
- data/Guardfile +1 -0
- data/LICENSE +14 -16
- data/README.md +54 -13
- data/Rakefile +7 -0
- data/SUPPORT.md +5 -0
- data/hydra-file_characterization.gemspec +12 -6
- data/lib/hydra-file_characterization.rb +1 -0
- data/lib/hydra/file_characterization.rb +32 -36
- data/lib/hydra/file_characterization/characterizer.rb +40 -33
- data/lib/hydra/file_characterization/characterizers.rb +5 -3
- data/lib/hydra/file_characterization/characterizers/ffprobe.rb +2 -2
- data/lib/hydra/file_characterization/characterizers/fits.rb +14 -9
- data/lib/hydra/file_characterization/characterizers/fits_servlet.rb +23 -0
- data/lib/hydra/file_characterization/exceptions.rb +1 -2
- data/lib/hydra/file_characterization/to_temp_file.rb +3 -3
- data/lib/hydra/file_characterization/version.rb +2 -1
- data/spec/lib/hydra/file_characterization/characterizer_spec.rb +9 -8
- data/spec/lib/hydra/file_characterization/characterizers/ffprobe_spec.rb +4 -7
- data/spec/lib/hydra/file_characterization/characterizers/fit_servlet_spec.rb +71 -0
- data/spec/lib/hydra/file_characterization/characterizers/fits_spec.rb +35 -12
- data/spec/lib/hydra/file_characterization/characterizers_spec.rb +10 -10
- data/spec/lib/hydra/file_characterization/to_temp_file_spec.rb +2 -4
- data/spec/lib/hydra/file_characterization_spec.rb +8 -3
- data/spec/spec_helper.rb +7 -2
- metadata +84 -6
@@ -1,13 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'hydra/file_characterization/exceptions'
|
2
3
|
require 'hydra/file_characterization/characterizer'
|
3
4
|
|
4
5
|
module Hydra::FileCharacterization::Characterizers
|
5
6
|
class Ffprobe < Hydra::FileCharacterization::Characterizer
|
6
|
-
|
7
7
|
protected
|
8
|
+
|
8
9
|
def command
|
9
10
|
"#{tool_path} -i \"#{filename}\" -print_format xml -show_streams -v quiet"
|
10
11
|
end
|
11
|
-
|
12
12
|
end
|
13
13
|
end
|
@@ -1,18 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'hydra/file_characterization/exceptions'
|
2
3
|
require 'hydra/file_characterization/characterizer'
|
4
|
+
require 'logger'
|
3
5
|
module Hydra::FileCharacterization::Characterizers
|
4
6
|
class Fits < Hydra::FileCharacterization::Characterizer
|
5
|
-
|
6
7
|
protected
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
def command
|
10
|
+
"#{tool_path} -i \"#{filename}\""
|
11
|
+
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
# Remove any non-XML output that precedes the <?xml> tag
|
14
|
+
# See: https://github.com/harvard-lts/fits/issues/20
|
15
|
+
# https://github.com/harvard-lts/fits/issues/40
|
16
|
+
# https://github.com/harvard-lts/fits/issues/46
|
17
|
+
def post_process(raw_output)
|
18
|
+
md = /\A(.*)(<\?xml.*)\Z/m.match(raw_output)
|
19
|
+
logger.warn "FITS produced non-xml output: \"#{md[1].chomp}\"" unless md[1].empty?
|
20
|
+
md[2]
|
21
|
+
end
|
17
22
|
end
|
18
23
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'hydra/file_characterization/exceptions'
|
3
|
+
require 'hydra/file_characterization/characterizer'
|
4
|
+
require 'logger'
|
5
|
+
module Hydra::FileCharacterization::Characterizers
|
6
|
+
class FitsServlet < Hydra::FileCharacterization::Characterizer
|
7
|
+
protected
|
8
|
+
|
9
|
+
def command
|
10
|
+
"curl -k -F datafile=@#{filename} #{ENV['FITS_SERVLET_URL']}/examine"
|
11
|
+
end
|
12
|
+
|
13
|
+
# Remove any non-XML output that precedes the <?xml> tag
|
14
|
+
# See: https://github.com/harvard-lts/fits/issues/20
|
15
|
+
# https://github.com/harvard-lts/fits/issues/40
|
16
|
+
# https://github.com/harvard-lts/fits/issues/46
|
17
|
+
def post_process(raw_output)
|
18
|
+
md = /\A(.*)(<\?xml.*)\Z/m.match(raw_output)
|
19
|
+
logger.warn "FITS produced non-xml output: \"#{md[1].chomp}\"" unless md[1].empty?
|
20
|
+
md[2]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -1,5 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Hydra::FileCharacterization
|
2
|
-
|
3
3
|
class FileNotFoundError < RuntimeError
|
4
4
|
end
|
5
5
|
|
@@ -8,5 +8,4 @@ module Hydra::FileCharacterization
|
|
8
8
|
super("Unable to find Hydra::FileCharacterization tool with name :#{tool_name}")
|
9
9
|
end
|
10
10
|
end
|
11
|
-
|
12
11
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'open3'
|
2
3
|
require 'tempfile'
|
3
4
|
|
@@ -16,7 +17,7 @@ module Hydra::FileCharacterization
|
|
16
17
|
end
|
17
18
|
|
18
19
|
def call(data)
|
19
|
-
f = Tempfile.new([File.basename(filename),File.extname(filename)])
|
20
|
+
f = Tempfile.new([File.basename(filename), File.extname(filename)])
|
20
21
|
begin
|
21
22
|
f.binmode
|
22
23
|
if data.respond_to? :read
|
@@ -32,6 +33,5 @@ module Hydra::FileCharacterization
|
|
32
33
|
f.unlink
|
33
34
|
end
|
34
35
|
end
|
35
|
-
|
36
36
|
end
|
37
|
-
end
|
37
|
+
end
|
@@ -1,30 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'spec_helper'
|
2
3
|
|
3
4
|
module Hydra::FileCharacterization
|
4
5
|
describe Characterizer do
|
6
|
+
subject { characterizer }
|
5
7
|
let(:filename) { __FILE__ }
|
6
8
|
let(:instance_tool_path) { nil }
|
7
9
|
let(:class_tool_path) { nil }
|
8
10
|
|
9
|
-
let(:characterizer) {
|
10
|
-
|
11
|
-
|
12
|
-
Hydra::FileCharacterization::Characterizer.tool_path = class_tool_path
|
11
|
+
let(:characterizer) { described_class.new(filename, instance_tool_path) }
|
12
|
+
around do |example|
|
13
|
+
described_class.tool_path = class_tool_path
|
13
14
|
example.run
|
14
|
-
|
15
|
+
described_class.tool_path = nil
|
15
16
|
end
|
16
17
|
|
17
18
|
context 'call' do
|
18
19
|
context 'with missing file' do
|
19
20
|
let(:filename) { '/dev/path/to/bogus/file' }
|
20
|
-
it '
|
21
|
+
it 'raises FileNotFoundError' do
|
21
22
|
expect { subject.call }.to raise_error(FileNotFoundError)
|
22
23
|
end
|
23
24
|
end
|
24
25
|
|
25
26
|
context 'with a callable tool path' do
|
26
|
-
let(:class_tool_path) {
|
27
|
-
it '
|
27
|
+
let(:class_tool_path) { ->(filename) { [filename, :output] } }
|
28
|
+
it 'raises FileNotFoundError' do
|
28
29
|
expect(subject.call).to eq [filename, :output]
|
29
30
|
end
|
30
31
|
end
|
@@ -1,19 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'spec_helper'
|
2
3
|
require 'hydra/file_characterization/characterizers/ffprobe'
|
3
4
|
|
4
5
|
module Hydra::FileCharacterization::Characterizers
|
5
|
-
|
6
6
|
describe Ffprobe do
|
7
|
-
|
8
|
-
subject { Ffprobe.new(filename) }
|
7
|
+
subject { described_class.new(filename) }
|
9
8
|
|
10
9
|
describe 'invalidFile' do
|
11
10
|
let(:filename) { fixture_file('nofile.pdf') }
|
12
|
-
it "
|
13
|
-
expect {subject.call}.to raise_error(Hydra::FileCharacterization::FileNotFoundError)
|
11
|
+
it "raises an error if the path does not contain the file" do
|
12
|
+
expect { subject.call }.to raise_error(Hydra::FileCharacterization::FileNotFoundError)
|
14
13
|
end
|
15
14
|
end
|
16
|
-
|
17
15
|
end
|
18
|
-
|
19
16
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'hydra/file_characterization/characterizers/fits_servlet'
|
4
|
+
|
5
|
+
module Hydra::FileCharacterization::Characterizers
|
6
|
+
describe FitsServlet do
|
7
|
+
let(:fits) { Fits.new(filename) }
|
8
|
+
|
9
|
+
describe "#call", unless: ENV['TRAVIS'] do
|
10
|
+
subject { fits.call }
|
11
|
+
|
12
|
+
context 'validfile' do
|
13
|
+
let(:filename) { fixture_file('brendan_behan.jpeg') }
|
14
|
+
it { is_expected.to include(%(<identity format="JPEG File Interchange Format" mimetype="image/jpeg")) }
|
15
|
+
end
|
16
|
+
|
17
|
+
context 'invalidFile' do
|
18
|
+
let(:filename) { fixture_file('nofile.pdf') }
|
19
|
+
it "raises an error" do
|
20
|
+
expect { subject }.to raise_error(Hydra::FileCharacterization::FileNotFoundError)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
context 'corruptFile' do
|
25
|
+
let(:filename) { fixture_file('brendan_broken.dxxd') }
|
26
|
+
it { is_expected.to include(%(<identity format="Unknown Binary" mimetype="application/octet-stream")) }
|
27
|
+
end
|
28
|
+
|
29
|
+
context 'zip file should be characterized not its contents' do
|
30
|
+
let(:filename) { fixture_file('archive.zip') }
|
31
|
+
it { is_expected.to include(%(<identity format="ZIP Format" mimetype="application/zip")) }
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
context 'when JHOVE adds non-xml' do
|
36
|
+
# https://github.com/harvard-lts/fits/issues/20
|
37
|
+
subject { fits.call }
|
38
|
+
|
39
|
+
before do
|
40
|
+
expect(fits.logger).to receive(:warn)
|
41
|
+
allow(fits).to receive(:internal_call).and_return(
|
42
|
+
'READBOX seen=true
|
43
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
44
|
+
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
|
45
|
+
<identification/></fits>'
|
46
|
+
)
|
47
|
+
end
|
48
|
+
|
49
|
+
let(:filename) { fixture_file('brendan_behan.jpeg') }
|
50
|
+
it { is_expected.not_to include('READBOX') }
|
51
|
+
end
|
52
|
+
|
53
|
+
context "when FITS itself adds non-xml" do
|
54
|
+
# https://github.com/harvard-lts/fits/issues/46
|
55
|
+
subject { fits.call }
|
56
|
+
|
57
|
+
before do
|
58
|
+
expect(fits.logger).to receive(:warn)
|
59
|
+
allow(fits).to receive(:internal_call).and_return(
|
60
|
+
'2015-10-15 17:14:25,761 ERROR [main] ToolBelt:79 - Thread 1 error initializing edu.harvard.hul.ois.fits.tools.droid.Droid: edu.harvard.hul.ois.fits.exceptions.FitsToolException Message: DROID cannot run under Java 8
|
61
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
62
|
+
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
|
63
|
+
<identification/></fits>'
|
64
|
+
)
|
65
|
+
end
|
66
|
+
|
67
|
+
let(:filename) { fixture_file('brendan_behan.jpeg') }
|
68
|
+
it { is_expected.not_to include('FitsToolException') }
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -1,11 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'spec_helper'
|
2
3
|
require 'hydra/file_characterization/characterizers/fits'
|
3
4
|
|
4
5
|
module Hydra::FileCharacterization::Characterizers
|
5
6
|
describe Fits do
|
6
|
-
let(:fits) {
|
7
|
+
let(:fits) { described_class.new(filename) }
|
7
8
|
|
8
|
-
describe "#call" do
|
9
|
+
describe "#call", unless: ENV['TRAVIS'] do
|
9
10
|
subject { fits.call }
|
10
11
|
|
11
12
|
context 'validfile' do
|
@@ -27,22 +28,44 @@ module Hydra::FileCharacterization::Characterizers
|
|
27
28
|
|
28
29
|
context 'zip file should be characterized not its contents' do
|
29
30
|
let(:filename) { fixture_file('archive.zip') }
|
30
|
-
it { is_expected.to include(%(<identity format="ZIP Format" mimetype="application/zip"))}
|
31
|
+
it { is_expected.to include(%(<identity format="ZIP Format" mimetype="application/zip")) }
|
31
32
|
end
|
33
|
+
end
|
34
|
+
|
35
|
+
context 'when JHOVE adds non-xml' do
|
36
|
+
# https://github.com/harvard-lts/fits/issues/20
|
37
|
+
subject { fits.call }
|
32
38
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
'READBOX seen=true
|
39
|
+
before do
|
40
|
+
expect(fits.logger).to receive(:warn)
|
41
|
+
allow(fits).to receive(:internal_call).and_return(
|
42
|
+
'READBOX seen=true
|
38
43
|
<?xml version="1.0" encoding="UTF-8"?>
|
39
44
|
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
|
40
|
-
|
41
|
-
|
45
|
+
<identification/></fits>'
|
46
|
+
)
|
47
|
+
end
|
42
48
|
|
43
|
-
|
44
|
-
|
49
|
+
let(:filename) { fixture_file('brendan_behan.jpeg') }
|
50
|
+
it { is_expected.not_to include('READBOX') }
|
51
|
+
end
|
52
|
+
|
53
|
+
context "when FITS itself adds non-xml" do
|
54
|
+
# https://github.com/harvard-lts/fits/issues/46
|
55
|
+
subject { fits.call }
|
56
|
+
|
57
|
+
before do
|
58
|
+
expect(fits.logger).to receive(:warn)
|
59
|
+
allow(fits).to receive(:internal_call).and_return(
|
60
|
+
'2015-10-15 17:14:25,761 ERROR [main] ToolBelt:79 - Thread 1 error initializing edu.harvard.hul.ois.fits.tools.droid.Droid: edu.harvard.hul.ois.fits.exceptions.FitsToolException Message: DROID cannot run under Java 8
|
61
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
62
|
+
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
|
63
|
+
<identification/></fits>'
|
64
|
+
)
|
45
65
|
end
|
66
|
+
|
67
|
+
let(:filename) { fixture_file('brendan_behan.jpeg') }
|
68
|
+
it { is_expected.not_to include('FitsToolException') }
|
46
69
|
end
|
47
70
|
end
|
48
71
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'spec_helper'
|
2
3
|
require 'hydra/file_characterization/characterizers'
|
3
4
|
|
@@ -7,36 +8,35 @@ module Hydra::FileCharacterization
|
|
7
8
|
|
8
9
|
describe 'with :fits tool_name' do
|
9
10
|
let(:tool_name) { :fits }
|
10
|
-
it {
|
11
|
+
it { is_expected.to eq(Characterizers::Fits) }
|
11
12
|
end
|
12
13
|
|
13
14
|
describe 'with :ffprobe tool_name' do
|
14
15
|
let(:tool_name) { :ffprobe }
|
15
|
-
it {
|
16
|
+
it { is_expected.to eq(Characterizers::Ffprobe) }
|
16
17
|
end
|
17
18
|
|
18
19
|
context '.characterize_with' do
|
20
|
+
subject { Hydra::FileCharacterization.characterize_with(tool_name, filename, tool_path) }
|
19
21
|
let(:tool_name) { :fits }
|
20
22
|
let(:filename) { __FILE__ }
|
21
23
|
let(:tool_path) { nil }
|
22
|
-
subject { Hydra::FileCharacterization.characterize_with(tool_name, filename, tool_path) }
|
23
24
|
|
24
25
|
context 'with callable tool_path and missing tool name' do
|
25
|
-
let(:tool_path) {
|
26
|
+
let(:tool_path) { ->(filename) { [filename, :tool_path] } }
|
26
27
|
let(:tool_name) { :chunky_salsa }
|
27
|
-
it {
|
28
|
+
it { is_expected.to eq [filename, :tool_path] }
|
28
29
|
end
|
29
30
|
|
30
31
|
context 'with missing tool name and non-callable tool_path' do
|
31
32
|
let(:tool_name) { :chunky_salsa }
|
32
33
|
let(:tool_path) { '/path' }
|
33
|
-
it '
|
34
|
-
expect
|
34
|
+
it 'raises exception' do
|
35
|
+
expect do
|
35
36
|
subject
|
36
|
-
|
37
|
+
end.to raise_error(ToolNotFoundError)
|
37
38
|
end
|
38
39
|
end
|
39
40
|
end
|
40
|
-
|
41
41
|
end
|
42
|
-
end
|
42
|
+
end
|
@@ -1,10 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
require 'spec_helper'
|
2
3
|
require 'hydra/file_characterization/to_temp_file'
|
3
4
|
|
4
5
|
module Hydra::FileCharacterization
|
5
|
-
|
6
6
|
describe 'ToTempFile' do
|
7
|
-
|
8
7
|
let(:content) { "This is the content of the file." }
|
9
8
|
let(:filename) { "hello.rb" }
|
10
9
|
|
@@ -44,6 +43,5 @@ module Hydra::FileCharacterization
|
|
44
43
|
end
|
45
44
|
end
|
46
45
|
end
|
47
|
-
|
48
46
|
end
|
49
|
-
end
|
47
|
+
end
|
@@ -6,7 +6,7 @@ module Hydra
|
|
6
6
|
|
7
7
|
describe FileCharacterization do
|
8
8
|
|
9
|
-
describe '.characterize' do
|
9
|
+
describe '.characterize', unless: ENV['TRAVIS'] do
|
10
10
|
describe "for content in memory" do
|
11
11
|
let(:content) { "class Test; end\n" }
|
12
12
|
let(:filename) { 'test.rb' }
|
@@ -18,10 +18,15 @@ module Hydra
|
|
18
18
|
end
|
19
19
|
|
20
20
|
describe 'with configured path' do
|
21
|
+
let(:tool_path) do
|
22
|
+
`which fits || which fits.sh`.strip
|
23
|
+
end
|
24
|
+
|
21
25
|
it {
|
22
26
|
response = Hydra::FileCharacterization.characterize(content, filename, :fits) do |config|
|
23
|
-
config[:fits] =
|
27
|
+
config[:fits] = tool_path
|
24
28
|
end
|
29
|
+
|
25
30
|
expect(response).to match(/#{'<identity format="Plain text" mimetype="text/plain"'}/)
|
26
31
|
}
|
27
32
|
end
|
@@ -89,7 +94,7 @@ module Hydra
|
|
89
94
|
Hydra::FileCharacterization::Characterizers::Fits.tool_path = old_tool_path
|
90
95
|
end
|
91
96
|
|
92
|
-
it 'without configuration' do
|
97
|
+
it 'without configuration', unless: ENV['CI'] do
|
93
98
|
Hydra::FileCharacterization.configure do |config|
|
94
99
|
config.tool_path(:fits, nil)
|
95
100
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
3
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
4
|
# Require this file using `require "spec_helper"` to ensure that it is only
|
@@ -6,7 +7,10 @@
|
|
6
7
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
8
|
|
8
9
|
GEM_ROOT = File.expand_path("../../", __FILE__)
|
9
|
-
|
10
|
+
$LOAD_PATH.unshift File.join(GEM_ROOT, "lib")
|
11
|
+
|
12
|
+
require 'coveralls'
|
13
|
+
Coveralls.wear!
|
10
14
|
|
11
15
|
require 'hydra/file_characterization'
|
12
16
|
|
@@ -27,6 +31,7 @@ RSpec.configure do |config|
|
|
27
31
|
# --seed 1234
|
28
32
|
config.order = 'random'
|
29
33
|
config.before(:suite) do
|
30
|
-
|
34
|
+
tool_path = `which fits || which fits.sh`.strip
|
35
|
+
Hydra::FileCharacterization::Characterizers::Fits.tool_path = tool_path
|
31
36
|
end
|
32
37
|
end
|