format_parser 0.25.4 → 0.28.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +104 -0
  3. data/CHANGELOG.md +15 -0
  4. data/README.md +4 -0
  5. data/format_parser.gemspec +1 -0
  6. data/lib/archive.rb +3 -0
  7. data/lib/audio.rb +3 -0
  8. data/lib/document.rb +1 -0
  9. data/lib/format_parser.rb +18 -3
  10. data/lib/format_parser/version.rb +1 -1
  11. data/lib/image.rb +3 -0
  12. data/lib/parsers/aiff_parser.rb +4 -1
  13. data/lib/parsers/bmp_parser.rb +3 -0
  14. data/lib/parsers/cr2_parser.rb +2 -0
  15. data/lib/parsers/dpx_parser.rb +19 -8
  16. data/lib/parsers/flac_parser.rb +2 -0
  17. data/lib/parsers/gif_parser.rb +2 -0
  18. data/lib/parsers/jpeg_parser.rb +2 -0
  19. data/lib/parsers/m3u_parser.rb +23 -0
  20. data/lib/parsers/moov_parser.rb +10 -1
  21. data/lib/parsers/mp3_parser.rb +3 -2
  22. data/lib/parsers/ogg_parser.rb +3 -2
  23. data/lib/parsers/pdf_parser.rb +2 -2
  24. data/lib/parsers/png_parser.rb +2 -0
  25. data/lib/parsers/psd_parser.rb +2 -0
  26. data/lib/parsers/tiff_parser.rb +10 -2
  27. data/lib/parsers/wav_parser.rb +3 -0
  28. data/lib/parsers/zip_parser.rb +5 -3
  29. data/lib/parsers/zip_parser/office_formats.rb +5 -5
  30. data/lib/remote_io.rb +7 -1
  31. data/lib/text.rb +19 -0
  32. data/lib/video.rb +3 -0
  33. data/spec/format_parser_spec.rb +20 -0
  34. data/spec/parsers/aiff_parser_spec.rb +1 -0
  35. data/spec/parsers/bmp_parser_spec.rb +8 -0
  36. data/spec/parsers/cr2_parser_spec.rb +1 -0
  37. data/spec/parsers/dpx_parser_spec.rb +1 -0
  38. data/spec/parsers/flac_parser_spec.rb +1 -0
  39. data/spec/parsers/gif_parser_spec.rb +1 -0
  40. data/spec/parsers/jpeg_parser_spec.rb +1 -0
  41. data/spec/parsers/m3u_parser_spec.rb +41 -0
  42. data/spec/parsers/moov_parser_spec.rb +4 -1
  43. data/spec/parsers/mp3_parser_spec.rb +1 -0
  44. data/spec/parsers/ogg_parser_spec.rb +1 -0
  45. data/spec/parsers/pdf_parser_spec.rb +1 -0
  46. data/spec/parsers/png_parser_spec.rb +1 -0
  47. data/spec/parsers/psd_parser_spec.rb +1 -0
  48. data/spec/parsers/tiff_parser_spec.rb +1 -0
  49. data/spec/parsers/wav_parser_spec.rb +1 -0
  50. data/spec/parsers/zip_parser_spec.rb +2 -0
  51. data/spec/remote_fetching_spec.rb +11 -0
  52. data/spec/remote_io_spec.rb +38 -13
  53. metadata +21 -4
  54. data/.travis.yml +0 -12
@@ -32,7 +32,7 @@ class FormatParser::MP3Parser
32
32
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
33
33
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
34
34
  TIFF_HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
35
-
35
+ MP3_MIME_TYPE = 'audio/mpeg'
36
36
  # Wraps the Tag object returned by ID3Tag in such
37
37
  # a way that a usable JSON representation gets
38
38
  # returned
@@ -104,7 +104,8 @@ class FormatParser::MP3Parser
104
104
  # do not tell anything of substance
105
105
  num_audio_channels: first_frame.channels,
106
106
  audio_sample_rate_hz: first_frame.sample_rate,
107
- intrinsics: id3tags_hash.merge(id3tags: tags)
107
+ intrinsics: id3tags_hash.merge(id3tags: tags),
108
+ content_type: MP3_MIME_TYPE,
108
109
  )
109
110
 
110
111
  extra_file_attirbutes = fetch_extra_attributes_from_id3_tags(id3tags_hash)
@@ -3,8 +3,8 @@
3
3
  class FormatParser::OggParser
4
4
  include FormatParser::IOUtils
5
5
 
6
- # Maximum size of an Ogg page
7
6
  MAX_POSSIBLE_PAGE_SIZE = 65307
7
+ OGG_MIME_TYPE = 'audio/ogg'
8
8
 
9
9
  def likely_match?(filename)
10
10
  filename =~ /\.ogg$/i
@@ -45,7 +45,8 @@ class FormatParser::OggParser
45
45
  format: :ogg,
46
46
  audio_sample_rate_hz: sample_rate,
47
47
  num_audio_channels: channels,
48
- media_duration_seconds: duration
48
+ media_duration_seconds: duration,
49
+ content_type: OGG_MIME_TYPE,
49
50
  )
50
51
  end
51
52
 
@@ -1,6 +1,5 @@
1
1
  class FormatParser::PDFParser
2
2
  include FormatParser::IOUtils
3
-
4
3
  # First 9 bytes of a PDF should be in this format, according to:
5
4
  #
6
5
  # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
@@ -8,6 +7,7 @@ class FormatParser::PDFParser
8
7
  # There are however exceptions, which are left out for now.
9
8
  #
10
9
  PDF_MARKER = /%PDF-1\.[0-8]{1}/
10
+ PDF_CONTENT_TYPE = 'application/pdf'
11
11
 
12
12
  def likely_match?(filename)
13
13
  filename =~ /\.(pdf|ai)$/i
@@ -18,7 +18,7 @@ class FormatParser::PDFParser
18
18
 
19
19
  return unless safe_read(io, 9) =~ PDF_MARKER
20
20
 
21
- FormatParser::Document.new(format: :pdf)
21
+ FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
22
22
  end
23
23
 
24
24
  FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
@@ -14,6 +14,7 @@ class FormatParser::PNGParser
14
14
  4 => true, # Grayscale with alpha
15
15
  6 => true,
16
16
  }
17
+ PNG_MIME_TYPE = 'image/png'
17
18
 
18
19
  def likely_match?(filename)
19
20
  filename =~ /\.png$/i
@@ -67,6 +68,7 @@ class FormatParser::PNGParser
67
68
  color_mode: color_mode,
68
69
  has_multiple_frames: has_animation,
69
70
  num_animation_or_video_frames: num_frames,
71
+ content_type: PNG_MIME_TYPE,
70
72
  )
71
73
  end
72
74
 
@@ -2,6 +2,7 @@ class FormatParser::PSDParser
2
2
  include FormatParser::IOUtils
3
3
 
4
4
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
5
+ PSD_MIME_TYPE = 'application/x-photoshop'
5
6
 
6
7
  def likely_match?(filename)
7
8
  filename =~ /\.psd$/i # Maybe also PSB at some point
@@ -20,6 +21,7 @@ class FormatParser::PSDParser
20
21
  format: :psd,
21
22
  width_px: w,
22
23
  height_px: h,
24
+ content_type: PSD_MIME_TYPE,
23
25
  )
24
26
  end
25
27
 
@@ -5,6 +5,8 @@ class FormatParser::TIFFParser
5
5
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
6
6
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
7
7
  HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
8
+ TIFF_MIME_TYPE = 'image/tiff'
9
+ ARW_MIME_TYPE = 'image/x-sony-arw'
8
10
 
9
11
  def likely_match?(filename)
10
12
  filename =~ /\.tiff?$/i
@@ -14,7 +16,10 @@ class FormatParser::TIFFParser
14
16
  io = FormatParser::IOConstraint.new(io)
15
17
 
16
18
  return unless HEADER_BYTES.include?(safe_read(io, 4))
17
- io.seek(io.pos + 2) # Skip over the offset of the IFD, EXIFR will re-read it anyway
19
+
20
+ # Skip over the offset of the IFD,
21
+ # EXIFR will re-read it anyway
22
+ io.seek(io.pos + 2)
18
23
  return if cr2?(io)
19
24
 
20
25
  # The TIFF scanner in EXIFR is plenty good enough,
@@ -26,14 +31,17 @@ class FormatParser::TIFFParser
26
31
  w = exif_data.width || exif_data.pixel_x_dimension
27
32
  h = exif_data.height || exif_data.pixel_y_dimension
28
33
 
34
+ format = arw?(exif_data) ? :arw : :tif
35
+ mime_type = arw?(exif_data) ? ARW_MIME_TYPE : TIFF_MIME_TYPE
29
36
  FormatParser::Image.new(
30
- format: arw?(exif_data) ? :arw : :tif, # Specify format as arw for Sony ARW format images, else tif
37
+ format: format,
31
38
  width_px: w,
32
39
  height_px: h,
33
40
  display_width_px: exif_data.rotated? ? h : w,
34
41
  display_height_px: exif_data.rotated? ? w : h,
35
42
  orientation: exif_data.orientation_sym,
36
43
  intrinsics: {exif: exif_data},
44
+ content_type: mime_type,
37
45
  )
38
46
  rescue EXIFR::MalformedTIFF
39
47
  nil
@@ -1,6 +1,8 @@
1
1
  class FormatParser::WAVParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ WAV_MIME_TYPE = 'audio/x-wav'
5
+
4
6
  def likely_match?(filename)
5
7
  filename =~ /\.wav$/i
6
8
  end
@@ -96,6 +98,7 @@ class FormatParser::WAVParser
96
98
  audio_sample_rate_hz: fmt_data[:sample_rate],
97
99
  media_duration_frames: sample_frames,
98
100
  media_duration_seconds: duration_in_seconds,
101
+ content_type: WAV_MIME_TYPE,
99
102
  )
100
103
  end
101
104
 
@@ -5,6 +5,8 @@ class FormatParser::ZIPParser
5
5
  include OfficeFormats
6
6
  include FormatParser::IOUtils
7
7
 
8
+ ZIP_MIME_TYPE = 'application/zip'
9
+
8
10
  def likely_match?(filename)
9
11
  filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
10
12
  end
@@ -25,10 +27,10 @@ class FormatParser::ZIPParser
25
27
  end
26
28
 
27
29
  if office_document?(filenames_set)
28
- office_format = office_file_format_from_entry_set(filenames_set)
29
- FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
30
+ office_format, mime_type = office_file_format_and_mime_type_from_entry_set(filenames_set)
31
+ FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive, content_type: mime_type)
30
32
  else
31
- FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
33
+ FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive, content_type: ZIP_MIME_TYPE)
32
34
  end
33
35
  rescue FileReader::Error
34
36
  # This is not a ZIP, or a broken ZIP.
@@ -37,15 +37,15 @@ module FormatParser::ZIPParser::OfficeFormats
37
37
  OFFICE_MARKER_FILES.subset?(filenames_set)
38
38
  end
39
39
 
40
- def office_file_format_from_entry_set(filenames_set)
40
+ def office_file_format_and_mime_type_from_entry_set(filenames_set)
41
41
  if filenames_set.include?('word/document.xml')
42
- :docx
42
+ [:docx, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
43
43
  elsif filenames_set.include?('xl/workbook.xml')
44
- :xlsx
44
+ [:xlsx, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
45
45
  elsif filenames_set.include?('ppt/presentation.xml')
46
- :pptx
46
+ [:pptx, 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
47
47
  else
48
- :unknown
48
+ [:unknown, 'application/zip']
49
49
  end
50
50
  end
51
51
  end
data/lib/remote_io.rb CHANGED
@@ -26,6 +26,7 @@ class FormatParser::RemoteIO
26
26
  # @param uri[URI, String] the remote URL to obtain
27
27
  def initialize(uri)
28
28
  require 'faraday'
29
+ require 'faraday_middleware/response/follow_redirects'
29
30
  @uri = uri
30
31
  @pos = 0
31
32
  @remote_size = false
@@ -78,7 +79,12 @@ class FormatParser::RemoteIO
78
79
  # We use a GET and not a HEAD request followed by a GET because
79
80
  # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
80
81
  # combine the first GET of a segment and retrieving the size of the resource
81
- response = Faraday.get(@uri, nil, range: 'bytes=%d-%d' % [range.begin, range.end])
82
+ conn = Faraday.new do |faraday|
83
+ faraday.use FaradayMiddleware::FollowRedirects
84
+ # we still need the default adapter, more details: https://blog.thecodewhisperer.com/permalink/losing-time-to-faraday
85
+ faraday.adapter Faraday.default_adapter
86
+ end
87
+ response = conn.get(@uri, nil, range: 'bytes=%d-%d' % [range.begin, range.end])
82
88
 
83
89
  case response.status
84
90
  when 200, 206
data/lib/text.rb ADDED
@@ -0,0 +1,19 @@
1
+ module FormatParser
2
+ class Text
3
+ include FormatParser::AttributesJSON
4
+
5
+ NATURE = :text
6
+
7
+ attr_accessor :format
8
+ attr_accessor :content_type
9
+
10
+ # Only permits assignments via defined accessors
11
+ def initialize(**attributes)
12
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
13
+ end
14
+
15
+ def nature
16
+ NATURE
17
+ end
18
+ end
19
+ end
data/lib/video.rb CHANGED
@@ -23,6 +23,9 @@ module FormatParser
23
23
  # it can be placed here
24
24
  attr_accessor :intrinsics
25
25
 
26
+ # The MIME type of the video
27
+ attr_accessor :content_type
28
+
26
29
  # Only permits assignments via defined accessors
27
30
  def initialize(**attributes)
28
31
  attributes.map { |(k, v)| public_send("#{k}=", v) }
@@ -173,6 +173,26 @@ describe FormatParser do
173
173
  prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
174
174
  expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
175
175
  end
176
+
177
+ it 'sorts the parsers by priority and name' do
178
+ parsers = FormatParser.parsers_for(
179
+ [:audio, :image],
180
+ [:cr2, :dpx, :fdx, :flac, :gif, :jpg, :mov, :mp4, :m4a, :mp3, :mpg, :mpeg, :ogg, :png, :tif, :wav]
181
+ )
182
+
183
+ expect(parsers.map { |parser| parser.class.name }).to eq([
184
+ 'FormatParser::GIFParser',
185
+ 'Class',
186
+ 'FormatParser::PNGParser',
187
+ 'FormatParser::CR2Parser',
188
+ 'FormatParser::DPXParser',
189
+ 'FormatParser::FLACParser',
190
+ 'FormatParser::MP3Parser',
191
+ 'FormatParser::OggParser',
192
+ 'FormatParser::TIFFParser',
193
+ 'FormatParser::WAVParser'
194
+ ])
195
+ end
176
196
  end
177
197
 
178
198
  describe '.register_parser and .deregister_parser' do
@@ -10,6 +10,7 @@ describe FormatParser::AIFFParser do
10
10
  expect(parse_result.num_audio_channels).to eq(2)
11
11
  expect(parse_result.audio_sample_rate_hz).to be_within(0.01).of(44100)
12
12
  expect(parse_result.media_duration_seconds).to be_within(0.01).of(1.05)
13
+ expect(parse_result.content_type).to eq('audio/x-aiff')
13
14
  end
14
15
 
15
16
  it 'parses a Logic Pro created AIFF sample file having a COMT chunk before a COMM chunk' do
@@ -13,6 +13,8 @@ describe FormatParser::BMPParser do
13
13
  expect(parsed.width_px).to eq(40)
14
14
  expect(parsed.height_px).to eq(27)
15
15
 
16
+ expect(parsed.content_type).to eq('image/bmp')
17
+
16
18
  expect(parsed.intrinsics).not_to be_nil
17
19
  expect(parsed.intrinsics[:vertical_resolution]).to eq(2834)
18
20
  expect(parsed.intrinsics[:horizontal_resolution]).to eq(2834)
@@ -32,6 +34,8 @@ describe FormatParser::BMPParser do
32
34
  expect(parsed.width_px).to eq(1920)
33
35
  expect(parsed.height_px).to eq(1080)
34
36
 
37
+ expect(parsed.content_type).to eq('image/bmp')
38
+
35
39
  expect(parsed.intrinsics).not_to be_nil
36
40
  expect(parsed.intrinsics[:vertical_resolution]).to eq(2835)
37
41
  expect(parsed.intrinsics[:horizontal_resolution]).to eq(2835)
@@ -51,6 +55,8 @@ describe FormatParser::BMPParser do
51
55
  expect(parsed.width_px).to eq(200)
52
56
  expect(parsed.height_px).to eq(200)
53
57
 
58
+ expect(parsed.content_type).to eq('image/bmp')
59
+
54
60
  expect(parsed.intrinsics).not_to be_nil
55
61
  end
56
62
 
@@ -64,6 +70,7 @@ describe FormatParser::BMPParser do
64
70
  expect(parsed.color_mode).to eq(:rgb)
65
71
  expect(parsed.width_px).to eq(40)
66
72
  expect(parsed.height_px).to eq(27)
73
+ expect(parsed.content_type).to eq('image/bmp')
67
74
  expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
68
75
  expect(parsed.intrinsics[:data_order]).to eq(:normal)
69
76
 
@@ -76,6 +83,7 @@ describe FormatParser::BMPParser do
76
83
  expect(parsed.color_mode).to eq(:rgb)
77
84
  expect(parsed.width_px).to eq(40)
78
85
  expect(parsed.height_px).to eq(27)
86
+ expect(parsed.content_type).to eq('image/bmp')
79
87
  expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
80
88
  expect(parsed.intrinsics[:data_order]).to eq(:normal)
81
89
  end
@@ -17,6 +17,7 @@ describe FormatParser::CR2Parser do
17
17
  expect(parsed.height_px).to be > 0
18
18
 
19
19
  expect(parsed.orientation).not_to be_nil
20
+ expect(parsed.content_type).to eq('image/x-canon-cr2')
20
21
  end
21
22
  end
22
23
  end
@@ -15,6 +15,7 @@ describe FormatParser::DPXParser do
15
15
  expect(parsed.width_px).to be_between(0, 2048)
16
16
  expect(parsed.height_px).to be_kind_of(Integer)
17
17
  expect(parsed.height_px).to be_between(0, 4000)
18
+ expect(parsed.content_type).to eq('image/x-dpx')
18
19
  end
19
20
  end
20
21
 
@@ -14,6 +14,7 @@ describe FormatParser::FLACParser do
14
14
  expect(parsed.intrinsics).not_to be_nil
15
15
  expect(parsed.media_duration_frames).to eq(33810)
16
16
  expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
17
+ expect(parsed.content_type).to eq('audio/x-flac')
17
18
  end
18
19
 
19
20
  it 'decodes and estimates duration for the 16bit FLAC File' do
@@ -17,6 +17,7 @@ describe FormatParser::GIFParser do
17
17
 
18
18
  expect(parsed.height_px).to be_kind_of(Integer)
19
19
  expect(parsed.height_px).to be > 0
20
+ expect(parsed.content_type).to eq('image/gif')
20
21
  end
21
22
  end
22
23
  end
@@ -14,6 +14,7 @@ describe FormatParser::JPEGParser do
14
14
 
15
15
  expect(parsed.height_px).to be_kind_of(Integer)
16
16
  expect(parsed.height_px).to be > 0
17
+ expect(parsed.content_type).to eq('image/jpeg')
17
18
  end
18
19
  end
19
20
  end
@@ -0,0 +1,41 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::M3UParser do
4
+ let(:parsed_m3u) do
5
+ subject.call(
6
+ File.open(
7
+ Pathname.new(fixtures_dir).join('M3U').join(m3u_file),
8
+ 'rb'
9
+ )
10
+ )
11
+ end
12
+
13
+ describe 'an m3u file with missing header' do
14
+ let(:m3u_file) { 'plain_text.m3u' }
15
+
16
+ it 'does not parse the file successfully' do
17
+ expect(parsed_m3u).to be_nil
18
+ end
19
+ end
20
+
21
+ describe 'an m3u file with valid header' do
22
+ let(:m3u_file) { 'sample.m3u' }
23
+
24
+ it 'parses the file successfully' do
25
+ expect(parsed_m3u).not_to be_nil
26
+ expect(parsed_m3u.nature).to eq(:text)
27
+ expect(parsed_m3u.format).to eq(:m3u)
28
+ expect(parsed_m3u.content_type).to eq('application/vnd.apple.mpegurl')
29
+ end
30
+ end
31
+
32
+ describe 'an m3u8 file with valid header' do
33
+ let(:m3u_file) { 'sample.m3u8' }
34
+
35
+ it 'parses the file successfully' do
36
+ expect(parsed_m3u).not_to be_nil
37
+ expect(parsed_m3u.nature).to eq(:text)
38
+ expect(parsed_m3u.format).to eq(:m3u)
39
+ end
40
+ end
41
+ end
@@ -37,7 +37,7 @@ describe FormatParser::MOOVParser do
37
37
  expect(result.nature).to eq(:audio)
38
38
  expect(result.media_duration_seconds).to be_kind_of(Float)
39
39
  expect(result.media_duration_seconds).to be > 0
40
-
40
+ expect(result.content_type).to be_kind_of(String)
41
41
  expect(result.intrinsics).not_to be_nil
42
42
  end
43
43
  end
@@ -52,6 +52,7 @@ describe FormatParser::MOOVParser do
52
52
  expect(result.height_px).to be > 0
53
53
  expect(result.media_duration_seconds).to be_kind_of(Float)
54
54
  expect(result.media_duration_seconds).to be > 0
55
+ expect(result.content_type).to eq('video/mp4')
55
56
 
56
57
  expect(result.intrinsics).not_to be_nil
57
58
  end
@@ -67,6 +68,7 @@ describe FormatParser::MOOVParser do
67
68
  expect(result.height_px).to be > 0
68
69
  expect(result.media_duration_seconds).to be_kind_of(Float)
69
70
  expect(result.media_duration_seconds).to be > 0
71
+ expect(result.content_type).to eq('video/mp4')
70
72
 
71
73
  expect(result.intrinsics).not_to be_nil
72
74
  end
@@ -79,6 +81,7 @@ describe FormatParser::MOOVParser do
79
81
  expect(result).not_to be_nil
80
82
  expect(result.nature).to eq(:audio)
81
83
  expect(result.format).to eq(:m4a)
84
+ expect(result.content_type).to eq('audio/mp4')
82
85
  end
83
86
 
84
87
  it 'parses a MOV file and provides the necessary metadata' do