format_parser 0.25.4 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +104 -0
  3. data/CHANGELOG.md +15 -0
  4. data/README.md +4 -0
  5. data/format_parser.gemspec +1 -0
  6. data/lib/archive.rb +3 -0
  7. data/lib/audio.rb +3 -0
  8. data/lib/document.rb +1 -0
  9. data/lib/format_parser.rb +18 -3
  10. data/lib/format_parser/version.rb +1 -1
  11. data/lib/image.rb +3 -0
  12. data/lib/parsers/aiff_parser.rb +4 -1
  13. data/lib/parsers/bmp_parser.rb +3 -0
  14. data/lib/parsers/cr2_parser.rb +2 -0
  15. data/lib/parsers/dpx_parser.rb +19 -8
  16. data/lib/parsers/flac_parser.rb +2 -0
  17. data/lib/parsers/gif_parser.rb +2 -0
  18. data/lib/parsers/jpeg_parser.rb +2 -0
  19. data/lib/parsers/m3u_parser.rb +23 -0
  20. data/lib/parsers/moov_parser.rb +10 -1
  21. data/lib/parsers/mp3_parser.rb +3 -2
  22. data/lib/parsers/ogg_parser.rb +3 -2
  23. data/lib/parsers/pdf_parser.rb +2 -2
  24. data/lib/parsers/png_parser.rb +2 -0
  25. data/lib/parsers/psd_parser.rb +2 -0
  26. data/lib/parsers/tiff_parser.rb +10 -2
  27. data/lib/parsers/wav_parser.rb +3 -0
  28. data/lib/parsers/zip_parser.rb +5 -3
  29. data/lib/parsers/zip_parser/office_formats.rb +5 -5
  30. data/lib/remote_io.rb +7 -1
  31. data/lib/text.rb +19 -0
  32. data/lib/video.rb +3 -0
  33. data/spec/format_parser_spec.rb +20 -0
  34. data/spec/parsers/aiff_parser_spec.rb +1 -0
  35. data/spec/parsers/bmp_parser_spec.rb +8 -0
  36. data/spec/parsers/cr2_parser_spec.rb +1 -0
  37. data/spec/parsers/dpx_parser_spec.rb +1 -0
  38. data/spec/parsers/flac_parser_spec.rb +1 -0
  39. data/spec/parsers/gif_parser_spec.rb +1 -0
  40. data/spec/parsers/jpeg_parser_spec.rb +1 -0
  41. data/spec/parsers/m3u_parser_spec.rb +41 -0
  42. data/spec/parsers/moov_parser_spec.rb +4 -1
  43. data/spec/parsers/mp3_parser_spec.rb +1 -0
  44. data/spec/parsers/ogg_parser_spec.rb +1 -0
  45. data/spec/parsers/pdf_parser_spec.rb +1 -0
  46. data/spec/parsers/png_parser_spec.rb +1 -0
  47. data/spec/parsers/psd_parser_spec.rb +1 -0
  48. data/spec/parsers/tiff_parser_spec.rb +1 -0
  49. data/spec/parsers/wav_parser_spec.rb +1 -0
  50. data/spec/parsers/zip_parser_spec.rb +2 -0
  51. data/spec/remote_fetching_spec.rb +11 -0
  52. data/spec/remote_io_spec.rb +38 -13
  53. metadata +21 -4
  54. data/.travis.yml +0 -12
@@ -32,7 +32,7 @@ class FormatParser::MP3Parser
32
32
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
33
33
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
34
34
  TIFF_HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
35
-
35
+ MP3_MIME_TYPE = 'audio/mpeg'
36
36
  # Wraps the Tag object returned by ID3Tag in such
37
37
  # a way that a usable JSON representation gets
38
38
  # returned
@@ -104,7 +104,8 @@ class FormatParser::MP3Parser
104
104
  # do not tell anything of substance
105
105
  num_audio_channels: first_frame.channels,
106
106
  audio_sample_rate_hz: first_frame.sample_rate,
107
- intrinsics: id3tags_hash.merge(id3tags: tags)
107
+ intrinsics: id3tags_hash.merge(id3tags: tags),
108
+ content_type: MP3_MIME_TYPE,
108
109
  )
109
110
 
110
111
  extra_file_attirbutes = fetch_extra_attributes_from_id3_tags(id3tags_hash)
@@ -3,8 +3,8 @@
3
3
  class FormatParser::OggParser
4
4
  include FormatParser::IOUtils
5
5
 
6
- # Maximum size of an Ogg page
7
6
  MAX_POSSIBLE_PAGE_SIZE = 65307
7
+ OGG_MIME_TYPE = 'audio/ogg'
8
8
 
9
9
  def likely_match?(filename)
10
10
  filename =~ /\.ogg$/i
@@ -45,7 +45,8 @@ class FormatParser::OggParser
45
45
  format: :ogg,
46
46
  audio_sample_rate_hz: sample_rate,
47
47
  num_audio_channels: channels,
48
- media_duration_seconds: duration
48
+ media_duration_seconds: duration,
49
+ content_type: OGG_MIME_TYPE,
49
50
  )
50
51
  end
51
52
 
@@ -1,6 +1,5 @@
1
1
  class FormatParser::PDFParser
2
2
  include FormatParser::IOUtils
3
-
4
3
  # First 9 bytes of a PDF should be in this format, according to:
5
4
  #
6
5
  # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
@@ -8,6 +7,7 @@ class FormatParser::PDFParser
8
7
  # There are however exceptions, which are left out for now.
9
8
  #
10
9
  PDF_MARKER = /%PDF-1\.[0-8]{1}/
10
+ PDF_CONTENT_TYPE = 'application/pdf'
11
11
 
12
12
  def likely_match?(filename)
13
13
  filename =~ /\.(pdf|ai)$/i
@@ -18,7 +18,7 @@ class FormatParser::PDFParser
18
18
 
19
19
  return unless safe_read(io, 9) =~ PDF_MARKER
20
20
 
21
- FormatParser::Document.new(format: :pdf)
21
+ FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
22
22
  end
23
23
 
24
24
  FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
@@ -14,6 +14,7 @@ class FormatParser::PNGParser
14
14
  4 => true, # Grayscale with alpha
15
15
  6 => true,
16
16
  }
17
+ PNG_MIME_TYPE = 'image/png'
17
18
 
18
19
  def likely_match?(filename)
19
20
  filename =~ /\.png$/i
@@ -67,6 +68,7 @@ class FormatParser::PNGParser
67
68
  color_mode: color_mode,
68
69
  has_multiple_frames: has_animation,
69
70
  num_animation_or_video_frames: num_frames,
71
+ content_type: PNG_MIME_TYPE,
70
72
  )
71
73
  end
72
74
 
@@ -2,6 +2,7 @@ class FormatParser::PSDParser
2
2
  include FormatParser::IOUtils
3
3
 
4
4
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
5
+ PSD_MIME_TYPE = 'application/x-photoshop'
5
6
 
6
7
  def likely_match?(filename)
7
8
  filename =~ /\.psd$/i # Maybe also PSB at some point
@@ -20,6 +21,7 @@ class FormatParser::PSDParser
20
21
  format: :psd,
21
22
  width_px: w,
22
23
  height_px: h,
24
+ content_type: PSD_MIME_TYPE,
23
25
  )
24
26
  end
25
27
 
@@ -5,6 +5,8 @@ class FormatParser::TIFFParser
5
5
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
6
6
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
7
7
  HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
8
+ TIFF_MIME_TYPE = 'image/tiff'
9
+ ARW_MIME_TYPE = 'image/x-sony-arw'
8
10
 
9
11
  def likely_match?(filename)
10
12
  filename =~ /\.tiff?$/i
@@ -14,7 +16,10 @@ class FormatParser::TIFFParser
14
16
  io = FormatParser::IOConstraint.new(io)
15
17
 
16
18
  return unless HEADER_BYTES.include?(safe_read(io, 4))
17
- io.seek(io.pos + 2) # Skip over the offset of the IFD, EXIFR will re-read it anyway
19
+
20
+ # Skip over the offset of the IFD,
21
+ # EXIFR will re-read it anyway
22
+ io.seek(io.pos + 2)
18
23
  return if cr2?(io)
19
24
 
20
25
  # The TIFF scanner in EXIFR is plenty good enough,
@@ -26,14 +31,17 @@ class FormatParser::TIFFParser
26
31
  w = exif_data.width || exif_data.pixel_x_dimension
27
32
  h = exif_data.height || exif_data.pixel_y_dimension
28
33
 
34
+ format = arw?(exif_data) ? :arw : :tif
35
+ mime_type = arw?(exif_data) ? ARW_MIME_TYPE : TIFF_MIME_TYPE
29
36
  FormatParser::Image.new(
30
- format: arw?(exif_data) ? :arw : :tif, # Specify format as arw for Sony ARW format images, else tif
37
+ format: format,
31
38
  width_px: w,
32
39
  height_px: h,
33
40
  display_width_px: exif_data.rotated? ? h : w,
34
41
  display_height_px: exif_data.rotated? ? w : h,
35
42
  orientation: exif_data.orientation_sym,
36
43
  intrinsics: {exif: exif_data},
44
+ content_type: mime_type,
37
45
  )
38
46
  rescue EXIFR::MalformedTIFF
39
47
  nil
@@ -1,6 +1,8 @@
1
1
  class FormatParser::WAVParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ WAV_MIME_TYPE = 'audio/x-wav'
5
+
4
6
  def likely_match?(filename)
5
7
  filename =~ /\.wav$/i
6
8
  end
@@ -96,6 +98,7 @@ class FormatParser::WAVParser
96
98
  audio_sample_rate_hz: fmt_data[:sample_rate],
97
99
  media_duration_frames: sample_frames,
98
100
  media_duration_seconds: duration_in_seconds,
101
+ content_type: WAV_MIME_TYPE,
99
102
  )
100
103
  end
101
104
 
@@ -5,6 +5,8 @@ class FormatParser::ZIPParser
5
5
  include OfficeFormats
6
6
  include FormatParser::IOUtils
7
7
 
8
+ ZIP_MIME_TYPE = 'application/zip'
9
+
8
10
  def likely_match?(filename)
9
11
  filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
10
12
  end
@@ -25,10 +27,10 @@ class FormatParser::ZIPParser
25
27
  end
26
28
 
27
29
  if office_document?(filenames_set)
28
- office_format = office_file_format_from_entry_set(filenames_set)
29
- FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
30
+ office_format, mime_type = office_file_format_and_mime_type_from_entry_set(filenames_set)
31
+ FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive, content_type: mime_type)
30
32
  else
31
- FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
33
+ FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive, content_type: ZIP_MIME_TYPE)
32
34
  end
33
35
  rescue FileReader::Error
34
36
  # This is not a ZIP, or a broken ZIP.
@@ -37,15 +37,15 @@ module FormatParser::ZIPParser::OfficeFormats
37
37
  OFFICE_MARKER_FILES.subset?(filenames_set)
38
38
  end
39
39
 
40
- def office_file_format_from_entry_set(filenames_set)
40
+ def office_file_format_and_mime_type_from_entry_set(filenames_set)
41
41
  if filenames_set.include?('word/document.xml')
42
- :docx
42
+ [:docx, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
43
43
  elsif filenames_set.include?('xl/workbook.xml')
44
- :xlsx
44
+ [:xlsx, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
45
45
  elsif filenames_set.include?('ppt/presentation.xml')
46
- :pptx
46
+ [:pptx, 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
47
47
  else
48
- :unknown
48
+ [:unknown, 'application/zip']
49
49
  end
50
50
  end
51
51
  end
data/lib/remote_io.rb CHANGED
@@ -26,6 +26,7 @@ class FormatParser::RemoteIO
26
26
  # @param uri[URI, String] the remote URL to obtain
27
27
  def initialize(uri)
28
28
  require 'faraday'
29
+ require 'faraday_middleware/response/follow_redirects'
29
30
  @uri = uri
30
31
  @pos = 0
31
32
  @remote_size = false
@@ -78,7 +79,12 @@ class FormatParser::RemoteIO
78
79
  # We use a GET and not a HEAD request followed by a GET because
79
80
  # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
80
81
  # combine the first GET of a segment and retrieving the size of the resource
81
- response = Faraday.get(@uri, nil, range: 'bytes=%d-%d' % [range.begin, range.end])
82
+ conn = Faraday.new do |faraday|
83
+ faraday.use FaradayMiddleware::FollowRedirects
84
+ # we still need the default adapter, more details: https://blog.thecodewhisperer.com/permalink/losing-time-to-faraday
85
+ faraday.adapter Faraday.default_adapter
86
+ end
87
+ response = conn.get(@uri, nil, range: 'bytes=%d-%d' % [range.begin, range.end])
82
88
 
83
89
  case response.status
84
90
  when 200, 206
data/lib/text.rb ADDED
@@ -0,0 +1,19 @@
1
+ module FormatParser
2
+ class Text
3
+ include FormatParser::AttributesJSON
4
+
5
+ NATURE = :text
6
+
7
+ attr_accessor :format
8
+ attr_accessor :content_type
9
+
10
+ # Only permits assignments via defined accessors
11
+ def initialize(**attributes)
12
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
13
+ end
14
+
15
+ def nature
16
+ NATURE
17
+ end
18
+ end
19
+ end
data/lib/video.rb CHANGED
@@ -23,6 +23,9 @@ module FormatParser
23
23
  # it can be placed here
24
24
  attr_accessor :intrinsics
25
25
 
26
+ # The MIME type of the video
27
+ attr_accessor :content_type
28
+
26
29
  # Only permits assignments via defined accessors
27
30
  def initialize(**attributes)
28
31
  attributes.map { |(k, v)| public_send("#{k}=", v) }
@@ -173,6 +173,26 @@ describe FormatParser do
173
173
  prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
174
174
  expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
175
175
  end
176
+
177
+ it 'sorts the parsers by priority and name' do
178
+ parsers = FormatParser.parsers_for(
179
+ [:audio, :image],
180
+ [:cr2, :dpx, :fdx, :flac, :gif, :jpg, :mov, :mp4, :m4a, :mp3, :mpg, :mpeg, :ogg, :png, :tif, :wav]
181
+ )
182
+
183
+ expect(parsers.map { |parser| parser.class.name }).to eq([
184
+ 'FormatParser::GIFParser',
185
+ 'Class',
186
+ 'FormatParser::PNGParser',
187
+ 'FormatParser::CR2Parser',
188
+ 'FormatParser::DPXParser',
189
+ 'FormatParser::FLACParser',
190
+ 'FormatParser::MP3Parser',
191
+ 'FormatParser::OggParser',
192
+ 'FormatParser::TIFFParser',
193
+ 'FormatParser::WAVParser'
194
+ ])
195
+ end
176
196
  end
177
197
 
178
198
  describe '.register_parser and .deregister_parser' do
@@ -10,6 +10,7 @@ describe FormatParser::AIFFParser do
10
10
  expect(parse_result.num_audio_channels).to eq(2)
11
11
  expect(parse_result.audio_sample_rate_hz).to be_within(0.01).of(44100)
12
12
  expect(parse_result.media_duration_seconds).to be_within(0.01).of(1.05)
13
+ expect(parse_result.content_type).to eq('audio/x-aiff')
13
14
  end
14
15
 
15
16
  it 'parses a Logic Pro created AIFF sample file having a COMT chunk before a COMM chunk' do
@@ -13,6 +13,8 @@ describe FormatParser::BMPParser do
13
13
  expect(parsed.width_px).to eq(40)
14
14
  expect(parsed.height_px).to eq(27)
15
15
 
16
+ expect(parsed.content_type).to eq('image/bmp')
17
+
16
18
  expect(parsed.intrinsics).not_to be_nil
17
19
  expect(parsed.intrinsics[:vertical_resolution]).to eq(2834)
18
20
  expect(parsed.intrinsics[:horizontal_resolution]).to eq(2834)
@@ -32,6 +34,8 @@ describe FormatParser::BMPParser do
32
34
  expect(parsed.width_px).to eq(1920)
33
35
  expect(parsed.height_px).to eq(1080)
34
36
 
37
+ expect(parsed.content_type).to eq('image/bmp')
38
+
35
39
  expect(parsed.intrinsics).not_to be_nil
36
40
  expect(parsed.intrinsics[:vertical_resolution]).to eq(2835)
37
41
  expect(parsed.intrinsics[:horizontal_resolution]).to eq(2835)
@@ -51,6 +55,8 @@ describe FormatParser::BMPParser do
51
55
  expect(parsed.width_px).to eq(200)
52
56
  expect(parsed.height_px).to eq(200)
53
57
 
58
+ expect(parsed.content_type).to eq('image/bmp')
59
+
54
60
  expect(parsed.intrinsics).not_to be_nil
55
61
  end
56
62
 
@@ -64,6 +70,7 @@ describe FormatParser::BMPParser do
64
70
  expect(parsed.color_mode).to eq(:rgb)
65
71
  expect(parsed.width_px).to eq(40)
66
72
  expect(parsed.height_px).to eq(27)
73
+ expect(parsed.content_type).to eq('image/bmp')
67
74
  expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
68
75
  expect(parsed.intrinsics[:data_order]).to eq(:normal)
69
76
 
@@ -76,6 +83,7 @@ describe FormatParser::BMPParser do
76
83
  expect(parsed.color_mode).to eq(:rgb)
77
84
  expect(parsed.width_px).to eq(40)
78
85
  expect(parsed.height_px).to eq(27)
86
+ expect(parsed.content_type).to eq('image/bmp')
79
87
  expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
80
88
  expect(parsed.intrinsics[:data_order]).to eq(:normal)
81
89
  end
@@ -17,6 +17,7 @@ describe FormatParser::CR2Parser do
17
17
  expect(parsed.height_px).to be > 0
18
18
 
19
19
  expect(parsed.orientation).not_to be_nil
20
+ expect(parsed.content_type).to eq('image/x-canon-cr2')
20
21
  end
21
22
  end
22
23
  end
@@ -15,6 +15,7 @@ describe FormatParser::DPXParser do
15
15
  expect(parsed.width_px).to be_between(0, 2048)
16
16
  expect(parsed.height_px).to be_kind_of(Integer)
17
17
  expect(parsed.height_px).to be_between(0, 4000)
18
+ expect(parsed.content_type).to eq('image/x-dpx')
18
19
  end
19
20
  end
20
21
 
@@ -14,6 +14,7 @@ describe FormatParser::FLACParser do
14
14
  expect(parsed.intrinsics).not_to be_nil
15
15
  expect(parsed.media_duration_frames).to eq(33810)
16
16
  expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
17
+ expect(parsed.content_type).to eq('audio/x-flac')
17
18
  end
18
19
 
19
20
  it 'decodes and estimates duration for the 16bit FLAC File' do
@@ -17,6 +17,7 @@ describe FormatParser::GIFParser do
17
17
 
18
18
  expect(parsed.height_px).to be_kind_of(Integer)
19
19
  expect(parsed.height_px).to be > 0
20
+ expect(parsed.content_type).to eq('image/gif')
20
21
  end
21
22
  end
22
23
  end
@@ -14,6 +14,7 @@ describe FormatParser::JPEGParser do
14
14
 
15
15
  expect(parsed.height_px).to be_kind_of(Integer)
16
16
  expect(parsed.height_px).to be > 0
17
+ expect(parsed.content_type).to eq('image/jpeg')
17
18
  end
18
19
  end
19
20
  end
@@ -0,0 +1,41 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::M3UParser do
4
+ let(:parsed_m3u) do
5
+ subject.call(
6
+ File.open(
7
+ Pathname.new(fixtures_dir).join('M3U').join(m3u_file),
8
+ 'rb'
9
+ )
10
+ )
11
+ end
12
+
13
+ describe 'an m3u file with missing header' do
14
+ let(:m3u_file) { 'plain_text.m3u' }
15
+
16
+ it 'does not parse the file successfully' do
17
+ expect(parsed_m3u).to be_nil
18
+ end
19
+ end
20
+
21
+ describe 'an m3u file with valid header' do
22
+ let(:m3u_file) { 'sample.m3u' }
23
+
24
+ it 'parses the file successfully' do
25
+ expect(parsed_m3u).not_to be_nil
26
+ expect(parsed_m3u.nature).to eq(:text)
27
+ expect(parsed_m3u.format).to eq(:m3u)
28
+ expect(parsed_m3u.content_type).to eq('application/vnd.apple.mpegurl')
29
+ end
30
+ end
31
+
32
+ describe 'an m3u8 file with valid header' do
33
+ let(:m3u_file) { 'sample.m3u8' }
34
+
35
+ it 'parses the file successfully' do
36
+ expect(parsed_m3u).not_to be_nil
37
+ expect(parsed_m3u.nature).to eq(:text)
38
+ expect(parsed_m3u.format).to eq(:m3u)
39
+ end
40
+ end
41
+ end
@@ -37,7 +37,7 @@ describe FormatParser::MOOVParser do
37
37
  expect(result.nature).to eq(:audio)
38
38
  expect(result.media_duration_seconds).to be_kind_of(Float)
39
39
  expect(result.media_duration_seconds).to be > 0
40
-
40
+ expect(result.content_type).to be_kind_of(String)
41
41
  expect(result.intrinsics).not_to be_nil
42
42
  end
43
43
  end
@@ -52,6 +52,7 @@ describe FormatParser::MOOVParser do
52
52
  expect(result.height_px).to be > 0
53
53
  expect(result.media_duration_seconds).to be_kind_of(Float)
54
54
  expect(result.media_duration_seconds).to be > 0
55
+ expect(result.content_type).to eq('video/mp4')
55
56
 
56
57
  expect(result.intrinsics).not_to be_nil
57
58
  end
@@ -67,6 +68,7 @@ describe FormatParser::MOOVParser do
67
68
  expect(result.height_px).to be > 0
68
69
  expect(result.media_duration_seconds).to be_kind_of(Float)
69
70
  expect(result.media_duration_seconds).to be > 0
71
+ expect(result.content_type).to eq('video/mp4')
70
72
 
71
73
  expect(result.intrinsics).not_to be_nil
72
74
  end
@@ -79,6 +81,7 @@ describe FormatParser::MOOVParser do
79
81
  expect(result).not_to be_nil
80
82
  expect(result.nature).to eq(:audio)
81
83
  expect(result.format).to eq(:m4a)
84
+ expect(result.content_type).to eq('audio/mp4')
82
85
  end
83
86
 
84
87
  it 'parses a MOV file and provides the necessary metadata' do