hydra-file_characterization 0.3.2 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +91 -0
  3. data/.github_changelog_generator +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.rubocop_todo.yml +92 -0
  6. data/CHANGELOG.md +189 -0
  7. data/CODE_OF_CONDUCT.md +36 -0
  8. data/CONTRIBUTING.md +70 -22
  9. data/Gemfile +9 -0
  10. data/Guardfile +1 -0
  11. data/LICENSE +14 -16
  12. data/README.md +54 -13
  13. data/Rakefile +7 -0
  14. data/SUPPORT.md +5 -0
  15. data/hydra-file_characterization.gemspec +12 -6
  16. data/lib/hydra-file_characterization.rb +1 -0
  17. data/lib/hydra/file_characterization.rb +32 -36
  18. data/lib/hydra/file_characterization/characterizer.rb +40 -33
  19. data/lib/hydra/file_characterization/characterizers.rb +5 -3
  20. data/lib/hydra/file_characterization/characterizers/ffprobe.rb +2 -2
  21. data/lib/hydra/file_characterization/characterizers/fits.rb +14 -9
  22. data/lib/hydra/file_characterization/characterizers/fits_servlet.rb +23 -0
  23. data/lib/hydra/file_characterization/exceptions.rb +1 -2
  24. data/lib/hydra/file_characterization/to_temp_file.rb +3 -3
  25. data/lib/hydra/file_characterization/version.rb +2 -1
  26. data/spec/lib/hydra/file_characterization/characterizer_spec.rb +9 -8
  27. data/spec/lib/hydra/file_characterization/characterizers/ffprobe_spec.rb +4 -7
  28. data/spec/lib/hydra/file_characterization/characterizers/fit_servlet_spec.rb +71 -0
  29. data/spec/lib/hydra/file_characterization/characterizers/fits_spec.rb +35 -12
  30. data/spec/lib/hydra/file_characterization/characterizers_spec.rb +10 -10
  31. data/spec/lib/hydra/file_characterization/to_temp_file_spec.rb +2 -4
  32. data/spec/lib/hydra/file_characterization_spec.rb +8 -3
  33. data/spec/spec_helper.rb +7 -2
  34. metadata +84 -6
@@ -1,13 +1,13 @@
1
+ # frozen_string_literal: true
1
2
  require 'hydra/file_characterization/exceptions'
2
3
  require 'hydra/file_characterization/characterizer'
3
4
 
4
5
  module Hydra::FileCharacterization::Characterizers
5
6
  class Ffprobe < Hydra::FileCharacterization::Characterizer
6
-
7
7
  protected
8
+
8
9
  def command
9
10
  "#{tool_path} -i \"#{filename}\" -print_format xml -show_streams -v quiet"
10
11
  end
11
-
12
12
  end
13
13
  end
@@ -1,18 +1,23 @@
1
+ # frozen_string_literal: true
1
2
  require 'hydra/file_characterization/exceptions'
2
3
  require 'hydra/file_characterization/characterizer'
4
+ require 'logger'
3
5
  module Hydra::FileCharacterization::Characterizers
4
6
  class Fits < Hydra::FileCharacterization::Characterizer
5
-
6
7
  protected
7
8
 
8
- def command
9
- "#{tool_path} -i \"#{filename}\""
10
- end
9
+ def command
10
+ "#{tool_path} -i \"#{filename}\""
11
+ end
11
12
 
12
- # Remove any residual non-XML from JHOVE
13
- # See: https://github.com/harvard-lts/fits/issues/20
14
- def post_process(raw_output)
15
- raw_output.sub(/^READBOX seen=true\n/, '')
16
- end
13
+ # Remove any non-XML output that precedes the <?xml> tag
14
+ # See: https://github.com/harvard-lts/fits/issues/20
15
+ # https://github.com/harvard-lts/fits/issues/40
16
+ # https://github.com/harvard-lts/fits/issues/46
17
+ def post_process(raw_output)
18
+ md = /\A(.*)(<\?xml.*)\Z/m.match(raw_output)
19
+ logger.warn "FITS produced non-xml output: \"#{md[1].chomp}\"" unless md[1].empty?
20
+ md[2]
21
+ end
17
22
  end
18
23
  end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+ require 'hydra/file_characterization/exceptions'
3
+ require 'hydra/file_characterization/characterizer'
4
+ require 'logger'
5
+ module Hydra::FileCharacterization::Characterizers
6
+ class FitsServlet < Hydra::FileCharacterization::Characterizer
7
+ protected
8
+
9
+ def command
10
+ "curl -k -F datafile=@#{filename} #{ENV['FITS_SERVLET_URL']}/examine"
11
+ end
12
+
13
+ # Remove any non-XML output that precedes the <?xml> tag
14
+ # See: https://github.com/harvard-lts/fits/issues/20
15
+ # https://github.com/harvard-lts/fits/issues/40
16
+ # https://github.com/harvard-lts/fits/issues/46
17
+ def post_process(raw_output)
18
+ md = /\A(.*)(<\?xml.*)\Z/m.match(raw_output)
19
+ logger.warn "FITS produced non-xml output: \"#{md[1].chomp}\"" unless md[1].empty?
20
+ md[2]
21
+ end
22
+ end
23
+ end
@@ -1,5 +1,5 @@
1
+ # frozen_string_literal: true
1
2
  module Hydra::FileCharacterization
2
-
3
3
  class FileNotFoundError < RuntimeError
4
4
  end
5
5
 
@@ -8,5 +8,4 @@ module Hydra::FileCharacterization
8
8
  super("Unable to find Hydra::FileCharacterization tool with name :#{tool_name}")
9
9
  end
10
10
  end
11
-
12
11
  end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  require 'open3'
2
3
  require 'tempfile'
3
4
 
@@ -16,7 +17,7 @@ module Hydra::FileCharacterization
16
17
  end
17
18
 
18
19
  def call(data)
19
- f = Tempfile.new([File.basename(filename),File.extname(filename)])
20
+ f = Tempfile.new([File.basename(filename), File.extname(filename)])
20
21
  begin
21
22
  f.binmode
22
23
  if data.respond_to? :read
@@ -32,6 +33,5 @@ module Hydra::FileCharacterization
32
33
  f.unlink
33
34
  end
34
35
  end
35
-
36
36
  end
37
- end
37
+ end
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Hydra
2
3
  module FileCharacterization
3
- VERSION = "0.3.2"
4
+ VERSION = "1.1.2"
4
5
  end
5
6
  end
@@ -1,30 +1,31 @@
1
+ # frozen_string_literal: true
1
2
  require 'spec_helper'
2
3
 
3
4
  module Hydra::FileCharacterization
4
5
  describe Characterizer do
6
+ subject { characterizer }
5
7
  let(:filename) { __FILE__ }
6
8
  let(:instance_tool_path) { nil }
7
9
  let(:class_tool_path) { nil }
8
10
 
9
- let(:characterizer) { Hydra::FileCharacterization::Characterizer.new(filename, instance_tool_path) }
10
- subject { characterizer }
11
- around(:each) do |example|
12
- Hydra::FileCharacterization::Characterizer.tool_path = class_tool_path
11
+ let(:characterizer) { described_class.new(filename, instance_tool_path) }
12
+ around do |example|
13
+ described_class.tool_path = class_tool_path
13
14
  example.run
14
- Hydra::FileCharacterization::Characterizer.tool_path = nil
15
+ described_class.tool_path = nil
15
16
  end
16
17
 
17
18
  context 'call' do
18
19
  context 'with missing file' do
19
20
  let(:filename) { '/dev/path/to/bogus/file' }
20
- it 'should raise FileNotFoundError' do
21
+ it 'raises FileNotFoundError' do
21
22
  expect { subject.call }.to raise_error(FileNotFoundError)
22
23
  end
23
24
  end
24
25
 
25
26
  context 'with a callable tool path' do
26
- let(:class_tool_path) { lambda { |filename| [filename, :output] }}
27
- it 'should raise FileNotFoundError' do
27
+ let(:class_tool_path) { ->(filename) { [filename, :output] } }
28
+ it 'raises FileNotFoundError' do
28
29
  expect(subject.call).to eq [filename, :output]
29
30
  end
30
31
  end
@@ -1,19 +1,16 @@
1
+ # frozen_string_literal: true
1
2
  require 'spec_helper'
2
3
  require 'hydra/file_characterization/characterizers/ffprobe'
3
4
 
4
5
  module Hydra::FileCharacterization::Characterizers
5
-
6
6
  describe Ffprobe do
7
-
8
- subject { Ffprobe.new(filename) }
7
+ subject { described_class.new(filename) }
9
8
 
10
9
  describe 'invalidFile' do
11
10
  let(:filename) { fixture_file('nofile.pdf') }
12
- it "should raise an error if the path does not contain the file" do
13
- expect {subject.call}.to raise_error(Hydra::FileCharacterization::FileNotFoundError)
11
+ it "raises an error if the path does not contain the file" do
12
+ expect { subject.call }.to raise_error(Hydra::FileCharacterization::FileNotFoundError)
14
13
  end
15
14
  end
16
-
17
15
  end
18
-
19
16
  end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+ require 'spec_helper'
3
+ require 'hydra/file_characterization/characterizers/fits_servlet'
4
+
5
+ module Hydra::FileCharacterization::Characterizers
6
+ describe FitsServlet do
7
+ let(:fits) { Fits.new(filename) }
8
+
9
+ describe "#call", unless: ENV['TRAVIS'] do
10
+ subject { fits.call }
11
+
12
+ context 'validfile' do
13
+ let(:filename) { fixture_file('brendan_behan.jpeg') }
14
+ it { is_expected.to include(%(<identity format="JPEG File Interchange Format" mimetype="image/jpeg")) }
15
+ end
16
+
17
+ context 'invalidFile' do
18
+ let(:filename) { fixture_file('nofile.pdf') }
19
+ it "raises an error" do
20
+ expect { subject }.to raise_error(Hydra::FileCharacterization::FileNotFoundError)
21
+ end
22
+ end
23
+
24
+ context 'corruptFile' do
25
+ let(:filename) { fixture_file('brendan_broken.dxxd') }
26
+ it { is_expected.to include(%(<identity format="Unknown Binary" mimetype="application/octet-stream")) }
27
+ end
28
+
29
+ context 'zip file should be characterized not its contents' do
30
+ let(:filename) { fixture_file('archive.zip') }
31
+ it { is_expected.to include(%(<identity format="ZIP Format" mimetype="application/zip")) }
32
+ end
33
+ end
34
+
35
+ context 'when JHOVE adds non-xml' do
36
+ # https://github.com/harvard-lts/fits/issues/20
37
+ subject { fits.call }
38
+
39
+ before do
40
+ expect(fits.logger).to receive(:warn)
41
+ allow(fits).to receive(:internal_call).and_return(
42
+ 'READBOX seen=true
43
+ <?xml version="1.0" encoding="UTF-8"?>
44
+ <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
45
+ <identification/></fits>'
46
+ )
47
+ end
48
+
49
+ let(:filename) { fixture_file('brendan_behan.jpeg') }
50
+ it { is_expected.not_to include('READBOX') }
51
+ end
52
+
53
+ context "when FITS itself adds non-xml" do
54
+ # https://github.com/harvard-lts/fits/issues/46
55
+ subject { fits.call }
56
+
57
+ before do
58
+ expect(fits.logger).to receive(:warn)
59
+ allow(fits).to receive(:internal_call).and_return(
60
+ '2015-10-15 17:14:25,761 ERROR [main] ToolBelt:79 - Thread 1 error initializing edu.harvard.hul.ois.fits.tools.droid.Droid: edu.harvard.hul.ois.fits.exceptions.FitsToolException Message: DROID cannot run under Java 8
61
+ <?xml version="1.0" encoding="UTF-8"?>
62
+ <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
63
+ <identification/></fits>'
64
+ )
65
+ end
66
+
67
+ let(:filename) { fixture_file('brendan_behan.jpeg') }
68
+ it { is_expected.not_to include('FitsToolException') }
69
+ end
70
+ end
71
+ end
@@ -1,11 +1,12 @@
1
+ # frozen_string_literal: true
1
2
  require 'spec_helper'
2
3
  require 'hydra/file_characterization/characterizers/fits'
3
4
 
4
5
  module Hydra::FileCharacterization::Characterizers
5
6
  describe Fits do
6
- let(:fits) { Fits.new(filename) }
7
+ let(:fits) { described_class.new(filename) }
7
8
 
8
- describe "#call" do
9
+ describe "#call", unless: ENV['TRAVIS'] do
9
10
  subject { fits.call }
10
11
 
11
12
  context 'validfile' do
@@ -27,22 +28,44 @@ module Hydra::FileCharacterization::Characterizers
27
28
 
28
29
  context 'zip file should be characterized not its contents' do
29
30
  let(:filename) { fixture_file('archive.zip') }
30
- it { is_expected.to include(%(<identity format="ZIP Format" mimetype="application/zip"))}
31
+ it { is_expected.to include(%(<identity format="ZIP Format" mimetype="application/zip")) }
31
32
  end
33
+ end
34
+
35
+ context 'when JHOVE adds non-xml' do
36
+ # https://github.com/harvard-lts/fits/issues/20
37
+ subject { fits.call }
32
38
 
33
- context 'when JHOVE adds non-xml' do
34
- # https://github.com/harvard-lts/fits/issues/20
35
- before do
36
- allow(fits).to receive(:internal_call).and_return(
37
- 'READBOX seen=true
39
+ before do
40
+ expect(fits.logger).to receive(:warn)
41
+ allow(fits).to receive(:internal_call).and_return(
42
+ 'READBOX seen=true
38
43
  <?xml version="1.0" encoding="UTF-8"?>
39
44
  <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
40
- <identification/></fits>')
41
- end
45
+ <identification/></fits>'
46
+ )
47
+ end
42
48
 
43
- let(:filename) { fixture_file('brendan_behan.jpeg') }
44
- it { is_expected.not_to include('READBOX') }
49
+ let(:filename) { fixture_file('brendan_behan.jpeg') }
50
+ it { is_expected.not_to include('READBOX') }
51
+ end
52
+
53
+ context "when FITS itself adds non-xml" do
54
+ # https://github.com/harvard-lts/fits/issues/46
55
+ subject { fits.call }
56
+
57
+ before do
58
+ expect(fits.logger).to receive(:warn)
59
+ allow(fits).to receive(:internal_call).and_return(
60
+ '2015-10-15 17:14:25,761 ERROR [main] ToolBelt:79 - Thread 1 error initializing edu.harvard.hul.ois.fits.tools.droid.Droid: edu.harvard.hul.ois.fits.exceptions.FitsToolException Message: DROID cannot run under Java 8
61
+ <?xml version="1.0" encoding="UTF-8"?>
62
+ <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.2" timestamp="15/09/14 10:00 AM">
63
+ <identification/></fits>'
64
+ )
45
65
  end
66
+
67
+ let(:filename) { fixture_file('brendan_behan.jpeg') }
68
+ it { is_expected.not_to include('FitsToolException') }
46
69
  end
47
70
  end
48
71
  end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  require 'spec_helper'
2
3
  require 'hydra/file_characterization/characterizers'
3
4
 
@@ -7,36 +8,35 @@ module Hydra::FileCharacterization
7
8
 
8
9
  describe 'with :fits tool_name' do
9
10
  let(:tool_name) { :fits }
10
- it { should eq(Characterizers::Fits) }
11
+ it { is_expected.to eq(Characterizers::Fits) }
11
12
  end
12
13
 
13
14
  describe 'with :ffprobe tool_name' do
14
15
  let(:tool_name) { :ffprobe }
15
- it { should eq(Characterizers::Ffprobe) }
16
+ it { is_expected.to eq(Characterizers::Ffprobe) }
16
17
  end
17
18
 
18
19
  context '.characterize_with' do
20
+ subject { Hydra::FileCharacterization.characterize_with(tool_name, filename, tool_path) }
19
21
  let(:tool_name) { :fits }
20
22
  let(:filename) { __FILE__ }
21
23
  let(:tool_path) { nil }
22
- subject { Hydra::FileCharacterization.characterize_with(tool_name, filename, tool_path) }
23
24
 
24
25
  context 'with callable tool_path and missing tool name' do
25
- let(:tool_path) { lambda {|filename| [filename, :tool_path]} }
26
+ let(:tool_path) { ->(filename) { [filename, :tool_path] } }
26
27
  let(:tool_name) { :chunky_salsa }
27
- it { should eq [filename, :tool_path] }
28
+ it { is_expected.to eq [filename, :tool_path] }
28
29
  end
29
30
 
30
31
  context 'with missing tool name and non-callable tool_path' do
31
32
  let(:tool_name) { :chunky_salsa }
32
33
  let(:tool_path) { '/path' }
33
- it 'should raise exception' do
34
- expect {
34
+ it 'raises exception' do
35
+ expect do
35
36
  subject
36
- }.to raise_error(ToolNotFoundError)
37
+ end.to raise_error(ToolNotFoundError)
37
38
  end
38
39
  end
39
40
  end
40
-
41
41
  end
42
- end
42
+ end
@@ -1,10 +1,9 @@
1
+ # frozen_string_literal: true
1
2
  require 'spec_helper'
2
3
  require 'hydra/file_characterization/to_temp_file'
3
4
 
4
5
  module Hydra::FileCharacterization
5
-
6
6
  describe 'ToTempFile' do
7
-
8
7
  let(:content) { "This is the content of the file." }
9
8
  let(:filename) { "hello.rb" }
10
9
 
@@ -44,6 +43,5 @@ module Hydra::FileCharacterization
44
43
  end
45
44
  end
46
45
  end
47
-
48
46
  end
49
- end
47
+ end
@@ -6,7 +6,7 @@ module Hydra
6
6
 
7
7
  describe FileCharacterization do
8
8
 
9
- describe '.characterize' do
9
+ describe '.characterize', unless: ENV['TRAVIS'] do
10
10
  describe "for content in memory" do
11
11
  let(:content) { "class Test; end\n" }
12
12
  let(:filename) { 'test.rb' }
@@ -18,10 +18,15 @@ module Hydra
18
18
  end
19
19
 
20
20
  describe 'with configured path' do
21
+ let(:tool_path) do
22
+ `which fits || which fits.sh`.strip
23
+ end
24
+
21
25
  it {
22
26
  response = Hydra::FileCharacterization.characterize(content, filename, :fits) do |config|
23
- config[:fits] = `which fits || which fits.sh`.strip
27
+ config[:fits] = tool_path
24
28
  end
29
+
25
30
  expect(response).to match(/#{'<identity format="Plain text" mimetype="text/plain"'}/)
26
31
  }
27
32
  end
@@ -89,7 +94,7 @@ module Hydra
89
94
  Hydra::FileCharacterization::Characterizers::Fits.tool_path = old_tool_path
90
95
  end
91
96
 
92
- it 'without configuration' do
97
+ it 'without configuration', unless: ENV['CI'] do
93
98
  Hydra::FileCharacterization.configure do |config|
94
99
  config.tool_path(:fits, nil)
95
100
  end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  # This file was generated by the `rspec --init` command. Conventionally, all
2
3
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
4
  # Require this file using `require "spec_helper"` to ensure that it is only
@@ -6,7 +7,10 @@
6
7
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
8
 
8
9
  GEM_ROOT = File.expand_path("../../", __FILE__)
9
- $:.unshift File.join(GEM_ROOT, "lib")
10
+ $LOAD_PATH.unshift File.join(GEM_ROOT, "lib")
11
+
12
+ require 'coveralls'
13
+ Coveralls.wear!
10
14
 
11
15
  require 'hydra/file_characterization'
12
16
 
@@ -27,6 +31,7 @@ RSpec.configure do |config|
27
31
  # --seed 1234
28
32
  config.order = 'random'
29
33
  config.before(:suite) do
30
- Hydra::FileCharacterization::Characterizers::Fits.tool_path = `which fits || which fits.sh`.strip
34
+ tool_path = `which fits || which fits.sh`.strip
35
+ Hydra::FileCharacterization::Characterizers::Fits.tool_path = tool_path
31
36
  end
32
37
  end