format_parser 0.25.3 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +3 -2
  3. data/CHANGELOG.md +16 -0
  4. data/README.md +4 -0
  5. data/lib/archive.rb +3 -0
  6. data/lib/audio.rb +3 -0
  7. data/lib/document.rb +1 -0
  8. data/lib/format_parser.rb +18 -3
  9. data/lib/format_parser/version.rb +1 -1
  10. data/lib/image.rb +3 -0
  11. data/lib/parsers/aiff_parser.rb +4 -1
  12. data/lib/parsers/bmp_parser.rb +3 -0
  13. data/lib/parsers/cr2_parser.rb +2 -0
  14. data/lib/parsers/dpx_parser.rb +19 -8
  15. data/lib/parsers/flac_parser.rb +2 -0
  16. data/lib/parsers/gif_parser.rb +2 -0
  17. data/lib/parsers/jpeg_parser.rb +2 -0
  18. data/lib/parsers/m3u_parser.rb +23 -0
  19. data/lib/parsers/moov_parser.rb +10 -1
  20. data/lib/parsers/mp3_parser.rb +9 -1
  21. data/lib/parsers/ogg_parser.rb +3 -2
  22. data/lib/parsers/pdf_parser.rb +2 -2
  23. data/lib/parsers/png_parser.rb +2 -0
  24. data/lib/parsers/psd_parser.rb +2 -0
  25. data/lib/parsers/tiff_parser.rb +12 -3
  26. data/lib/parsers/wav_parser.rb +3 -0
  27. data/lib/parsers/zip_parser.rb +5 -3
  28. data/lib/parsers/zip_parser/office_formats.rb +5 -5
  29. data/lib/text.rb +19 -0
  30. data/lib/video.rb +3 -0
  31. data/spec/format_parser_spec.rb +20 -0
  32. data/spec/parsers/aiff_parser_spec.rb +1 -0
  33. data/spec/parsers/bmp_parser_spec.rb +8 -0
  34. data/spec/parsers/cr2_parser_spec.rb +1 -0
  35. data/spec/parsers/dpx_parser_spec.rb +1 -0
  36. data/spec/parsers/flac_parser_spec.rb +1 -0
  37. data/spec/parsers/gif_parser_spec.rb +1 -0
  38. data/spec/parsers/jpeg_parser_spec.rb +1 -0
  39. data/spec/parsers/m3u_parser_spec.rb +41 -0
  40. data/spec/parsers/moov_parser_spec.rb +4 -1
  41. data/spec/parsers/mp3_parser_spec.rb +9 -0
  42. data/spec/parsers/ogg_parser_spec.rb +1 -0
  43. data/spec/parsers/pdf_parser_spec.rb +1 -0
  44. data/spec/parsers/png_parser_spec.rb +1 -0
  45. data/spec/parsers/psd_parser_spec.rb +1 -0
  46. data/spec/parsers/tiff_parser_spec.rb +1 -0
  47. data/spec/parsers/wav_parser_spec.rb +1 -0
  48. data/spec/parsers/zip_parser_spec.rb +2 -0
  49. metadata +6 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2d0b2c07289221019c42f9546eee65b4ccd5c49aadc3c16f7e4192f356821bcb
4
- data.tar.gz: 7163ca3bfac79fe5539979e5723902db8fe530ee721d09ba3b18cf635280ece6
3
+ metadata.gz: 1a10ceeaca4d0d6d2336b94f9fc397781ae2ffabdb588cee7ebc59fdcb968082
4
+ data.tar.gz: c28b8b7a0eb1d83e9f93406a4bbbad0699e50248d49d413393d6c3a6d82f7acf
5
5
  SHA512:
6
- metadata.gz: dfb72a909878a032d6f832aa0a52bd0521df31faacff2f5d070e9b4f555d420ad11abecfc30e2074f7897250b530543a4892b60d37037f67b87dd8c9b10c2b8b
7
- data.tar.gz: 24b69b8ad67b5d4461f63055b379af270a936172660d6350cc98ee2caa2c47ff3b7d94bf353de673ba331ba273b1e86090f83ef282933f261978250562aa22cd
6
+ metadata.gz: 6b3780e0f615f8a42aa097b652b675ceba5fb2325c3f2ad7472e178204cef3838967ea56c25e0daa1f5c666df42c5f5d4252a4dd42dae26389c41288d5b56d30
7
+ data.tar.gz: f6f22e664f8603e691795b44902cc5d677f63a02b4950233ea09719b9b2e8ae7d9eea57e406bd4f5b9e252aff8e9c5a684cdb1155846f0eedeceeba7fe7131c0
@@ -2,8 +2,9 @@ rvm:
2
2
  - 2.2.10
3
3
  - 2.3.8
4
4
  - 2.4.9
5
- - 2.5.7
6
- - 2.6.5
5
+ - 2.5.8
6
+ - 2.6.6
7
+ - 2.7.2
7
8
  - jruby
8
9
  sudo: false
9
10
  cache: bundler
@@ -1,3 +1,19 @@
1
+ ## 0.27.0
2
+ * Add `#content_type` on `Result` return values which makes sense for the detected filetype
3
+
4
+ ## 0.26.0
5
+ * Add support for M3U format files
6
+
7
+ ## 0.25.6
8
+ * Fix FormatParser.parse (with `results: :first`) to be deterministic
9
+
10
+ ## 0.25.5
11
+ * DPX: Fix DPXParser to support images without aspect ratio
12
+
13
+ ## 0.25.4
14
+ * MP3: Fix MP3Parser to return nil for TIFF files
15
+ * Add support for Ruby 2.7
16
+
1
17
  ## 0.25.3
2
18
  * MP3: Fix parser to not skip the first bytes if it's not an ID3 header
3
19
 
data/README.md CHANGED
@@ -32,6 +32,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
32
32
  * DOCX, PPTX, XLSX
33
33
  * OGG
34
34
  * MPEG, MPG
35
+ * M3U
35
36
 
36
37
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
37
38
 
@@ -194,6 +195,9 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
194
195
  manipulated using the [exif-orientation-examples](https://github.com/recurser/exif-orientation-examples)
195
196
  script.
196
197
 
198
+ ### M3U
199
+ - The M3U fixture files were created by one of the project maintainers
200
+
197
201
  ### .key
198
202
  - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
199
203
 
@@ -26,6 +26,9 @@ module FormatParser
26
26
  # it can be placed here
27
27
  attr_accessor :intrinsics
28
28
 
29
+ # The MIME type of the archive
30
+ attr_accessor :content_type
31
+
29
32
  # Only permits assignments via defined accessors
30
33
  def initialize(**attributes)
31
34
  attributes.map { |(k, v)| public_send("#{k}=", v) }
@@ -35,6 +35,9 @@ module FormatParser
35
35
  # it can be placed here
36
36
  attr_accessor :intrinsics
37
37
 
38
+ # The MIME type of the sound file
39
+ attr_accessor :content_type
40
+
38
41
  # Only permits assignments via defined accessors
39
42
  def initialize(**attributes)
40
43
  attributes.map { |(k, v)| public_send("#{k}=", v) }
@@ -7,6 +7,7 @@ module FormatParser
7
7
  attr_accessor :format
8
8
  attr_accessor :document_type
9
9
  attr_accessor :page_count
10
+ attr_accessor :content_type
10
11
 
11
12
  # Only permits assignments via defined accessors
12
13
  def initialize(**attributes)
@@ -19,6 +19,7 @@ module FormatParser
19
19
  require_relative 'io_constraint'
20
20
  require_relative 'care'
21
21
  require_relative 'active_storage/blob_analyzer'
22
+ require_relative 'text'
22
23
 
23
24
  # Define Measurometer in the internal namespace as well
24
25
  # so that we stay compatible for the applications that use it
@@ -49,8 +50,10 @@ module FormatParser
49
50
  parser_provided_formats = Array(formats)
50
51
  parser_provided_natures = Array(natures)
51
52
  PARSER_MUX.synchronize do
52
- @parsers ||= Set.new
53
- @parsers << callable_parser
53
+ # It can't be a Set because the method `parsers_for` depends on the order
54
+ # that the parsers were added.
55
+ @parsers ||= []
56
+ @parsers << callable_parser unless @parsers.include?(callable_parser)
54
57
  @parsers_per_nature ||= {}
55
58
  parser_provided_natures.each do |provided_nature|
56
59
  @parsers_per_nature[provided_nature] ||= Set.new
@@ -255,7 +258,19 @@ module FormatParser
255
258
  # Order the parsers according to their priority value. The ones having a lower
256
259
  # value will sort higher and will be applied sooner
257
260
  parsers_in_order_of_priority = parsers.to_a.sort do |parser_a, parser_b|
258
- @parser_priorities[parser_a] <=> @parser_priorities[parser_b]
261
+ if @parser_priorities[parser_a] != @parser_priorities[parser_b]
262
+ @parser_priorities[parser_a] <=> @parser_priorities[parser_b]
263
+ else
264
+ # Some parsers have the same priority and we want them to be always sorted
265
+ # in the same way, to not change the result of FormatParser.parse(results: :first).
266
+ # When this changes, it can generate flaky tests or even different
267
+ # results in different environments, which can be hard to understand why.
268
+ # There is also no guarantee in the order that the elements are added in
269
+ # @parser_priorities
270
+ # So, to have always the same order, we sort by the order that the parsers
271
+ # were registered if the priorities are the same.
272
+ @parsers.index(parser_a) <=> @parsers.index(parser_b)
273
+ end
259
274
  end
260
275
 
261
276
  # If there is one parser that is more likely to match, place it first
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.25.3'
2
+ VERSION = '0.27.0'
3
3
  end
@@ -64,6 +64,9 @@ module FormatParser
64
64
  # it can be placed here
65
65
  attr_accessor :intrinsics
66
66
 
67
+ # The MIME type of the image file
68
+ attr_accessor :content_type
69
+
67
70
  # Only permits assignments via defined accessors
68
71
  def initialize(**attributes)
69
72
  attributes.map { |(k, v)| public_send("#{k}=", v) }
@@ -1,6 +1,8 @@
1
1
  class FormatParser::AIFFParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ AIFF_MIME_TYPE = 'audio/x-aiff'
5
+
4
6
  # Known chunk types we can omit when parsing,
5
7
  # grossly lifted from http://www.muratnkonar.com/aiff/
6
8
  KNOWN_CHUNKS = [
@@ -70,7 +72,8 @@ class FormatParser::AIFFParser
70
72
  num_audio_channels: channels,
71
73
  audio_sample_rate_hz: sample_rate.to_i,
72
74
  media_duration_frames: sample_frames,
73
- media_duration_seconds: duration_in_seconds
75
+ media_duration_seconds: duration_in_seconds,
76
+ content_type: AIFF_MIME_TYPE,
74
77
  )
75
78
  end
76
79
 
@@ -5,6 +5,7 @@ class FormatParser::BMPParser
5
5
 
6
6
  VALID_BMP = 'BM'
7
7
  PERMISSIBLE_PIXEL_ARRAY_LOCATIONS = 26..512
8
+ BMP_MIME_TYPE = 'image/bmp'
8
9
 
9
10
  def likely_match?(filename)
10
11
  filename =~ /\.bmp$/i
@@ -42,6 +43,7 @@ class FormatParser::BMPParser
42
43
  width_px: width,
43
44
  height_px: height,
44
45
  color_mode: :rgb,
46
+ content_type: BMP_MIME_TYPE,
45
47
  intrinsics: {
46
48
  data_order: data_order,
47
49
  bits_per_pixel: bit_depth
@@ -63,6 +65,7 @@ class FormatParser::BMPParser
63
65
  width_px: width,
64
66
  height_px: height.abs,
65
67
  color_mode: :rgb,
68
+ content_type: BMP_MIME_TYPE,
66
69
  intrinsics: {
67
70
  vertical_resolution: vertical_res,
68
71
  horizontal_resolution: horizontal_res,
@@ -6,6 +6,7 @@ class FormatParser::CR2Parser
6
6
 
7
7
  TIFF_HEADER = [0x49, 0x49, 0x2a, 0x00]
8
8
  CR2_HEADER = [0x43, 0x52, 0x02, 0x00]
9
+ CR2_MIME_TYPE = 'image/x-canon-cr2'
9
10
 
10
11
  def likely_match?(filename)
11
12
  filename =~ /\.cr2$/i
@@ -39,6 +40,7 @@ class FormatParser::CR2Parser
39
40
  display_height_px: exif_data.rotated? ? w : h,
40
41
  orientation: exif_data.orientation_sym,
41
42
  intrinsics: {exif: exif_data},
43
+ content_type: CR2_MIME_TYPE,
42
44
  )
43
45
  rescue EXIFR::MalformedTIFF
44
46
  nil
@@ -6,6 +6,11 @@ class FormatParser::DPXParser
6
6
  BE_MAGIC = 'SDPX'
7
7
  LE_MAGIC = BE_MAGIC.reverse
8
8
 
9
+ # There is no official MIME type for DPX, so we have
10
+ # to invent something useful. We will prefix it with x-
11
+ # to indicate that it is a vendor subtype
12
+ DPX_MIME_TYPE = 'image/x-dpx'
13
+
9
14
  class ByteOrderHintIO < SimpleDelegator
10
15
  def initialize(io, is_little_endian)
11
16
  super(io)
@@ -35,18 +40,23 @@ class FormatParser::DPXParser
35
40
  w = dpx_structure.fetch(:image).fetch(:pixels_per_line)
36
41
  h = dpx_structure.fetch(:image).fetch(:lines_per_element)
37
42
 
43
+ display_w = w
44
+ display_h = h
45
+
38
46
  pixel_aspect_w = dpx_structure.fetch(:orientation).fetch(:horizontal_pixel_aspect)
39
47
  pixel_aspect_h = dpx_structure.fetch(:orientation).fetch(:vertical_pixel_aspect)
40
- pixel_aspect = pixel_aspect_w / pixel_aspect_h.to_f
41
48
 
42
- image_aspect = w / h.to_f * pixel_aspect
49
+ # Find display height and width based on aspect only if the file structure has pixel aspects
50
+ if pixel_aspect_h != 0 && pixel_aspect_w != 0
51
+ pixel_aspect = pixel_aspect_w / pixel_aspect_h.to_f
43
52
 
44
- display_w = w
45
- display_h = h
46
- if image_aspect > 1
47
- display_h = (display_w / image_aspect).round
48
- else
49
- display_w = (display_h * image_aspect).round
53
+ image_aspect = w / h.to_f * pixel_aspect
54
+
55
+ if image_aspect > 1
56
+ display_h = (display_w / image_aspect).round
57
+ else
58
+ display_w = (display_h * image_aspect).round
59
+ end
50
60
  end
51
61
 
52
62
  FormatParser::Image.new(
@@ -56,6 +66,7 @@ class FormatParser::DPXParser
56
66
  display_width_px: display_w,
57
67
  display_height_px: display_h,
58
68
  intrinsics: dpx_structure,
69
+ content_type: DPX_MIME_TYPE,
59
70
  )
60
71
  end
61
72
 
@@ -4,6 +4,7 @@ class FormatParser::FLACParser
4
4
  MAGIC_BYTES = 4
5
5
  MAGIC_BYTE_STRING = 'fLaC'
6
6
  BLOCK_HEADER_BYTES = 4
7
+ FLAC_MIME_TYPE = 'audio/x-flac'
7
8
 
8
9
  def likely_match?(filename)
9
10
  filename =~ /\.flac$/i
@@ -61,6 +62,7 @@ class FormatParser::FLACParser
61
62
  audio_sample_rate_hz: sample_rate,
62
63
  media_duration_seconds: duration,
63
64
  media_duration_frames: total_samples,
65
+ content_type: FLAC_MIME_TYPE,
64
66
  intrinsics: {
65
67
  bits_per_sample: bits_per_sample,
66
68
  minimum_frame_size: minimum_frame_size,
@@ -3,6 +3,7 @@ class FormatParser::GIFParser
3
3
 
4
4
  HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
5
5
  NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
6
+ GIF_MIME_TYPE = 'image/gif'
6
7
 
7
8
  def likely_match?(filename)
8
9
  filename =~ /\.gif$/i
@@ -45,6 +46,7 @@ class FormatParser::GIFParser
45
46
  height_px: h,
46
47
  has_multiple_frames: is_animated,
47
48
  color_mode: :indexed,
49
+ content_type: GIF_MIME_TYPE
48
50
  )
49
51
  end
50
52
 
@@ -12,6 +12,7 @@ class FormatParser::JPEGParser
12
12
  APP1_MARKER = 0xE1 # maybe EXIF
13
13
  EXIF_MAGIC_STRING = "Exif\0\0".b
14
14
  MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
15
+ JPEG_MIME_TYPE = 'image/jpeg'
15
16
 
16
17
  def self.likely_match?(filename)
17
18
  filename =~ /\.jpe?g$/i
@@ -88,6 +89,7 @@ class FormatParser::JPEGParser
88
89
  display_height_px: dh,
89
90
  orientation: flat_exif.orientation_sym,
90
91
  intrinsics: {exif: flat_exif},
92
+ content_type: JPEG_MIME_TYPE
91
93
  )
92
94
 
93
95
  return result
@@ -0,0 +1,23 @@
1
+ class FormatParser::M3UParser
2
+ include FormatParser::IOUtils
3
+
4
+ HEADER = '#EXTM3U'
5
+ M3U8_MIME_TYPE = 'application/vnd.apple.mpegurl' # https://en.wikipedia.org/wiki/M3U#Internet_media_types
6
+
7
+ def likely_match?(filename)
8
+ filename =~ /\.m3u8?$/i
9
+ end
10
+
11
+ def call(io)
12
+ io = FormatParser::IOConstraint.new(io)
13
+
14
+ header = safe_read(io, 7)
15
+ return unless HEADER.eql?(header)
16
+
17
+ FormatParser::Text.new(
18
+ format: :m3u,
19
+ content_type: M3U8_MIME_TYPE,
20
+ )
21
+ end
22
+ FormatParser.register_parser new, natures: :text, formats: :m3u
23
+ end
@@ -11,6 +11,12 @@ class FormatParser::MOOVParser
11
11
  'm4a ' => :m4a,
12
12
  }
13
13
 
14
+ # https://tools.ietf.org/html/rfc4337#section-2
15
+ # There is also video/quicktime which we should be able to capture
16
+ # here, but there is currently no detection for MOVs versus MP4s
17
+ MP4_AU_MIME_TYPE = 'audio/mp4'
18
+ MP4_MIXED_MIME_TYPE = 'video/mp4'
19
+
14
20
  def likely_match?(filename)
15
21
  filename =~ /\.(mov|m4a|ma4|mp4|aac|m4v)$/i
16
22
  end
@@ -49,10 +55,12 @@ class FormatParser::MOOVParser
49
55
  end
50
56
 
51
57
  # M4A only contains audio, while MP4 and friends can contain video.
52
- if format_from_moov_type(file_type) == :m4a
58
+ fmt = format_from_moov_type(file_type)
59
+ if fmt == :m4a
53
60
  FormatParser::Audio.new(
54
61
  format: format_from_moov_type(file_type),
55
62
  media_duration_seconds: media_duration_s,
63
+ content_type: MP4_AU_MIME_TYPE,
56
64
  intrinsics: atom_tree,
57
65
  )
58
66
  else
@@ -61,6 +69,7 @@ class FormatParser::MOOVParser
61
69
  width_px: width,
62
70
  height_px: height,
63
71
  media_duration_seconds: media_duration_s,
72
+ content_type: MP4_MIXED_MIME_TYPE,
64
73
  intrinsics: atom_tree,
65
74
  )
66
75
  end
@@ -29,6 +29,10 @@ class FormatParser::MP3Parser
29
29
  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
30
30
  PNG_HEADER_BYTES = [137, 80, 78, 71, 13, 10, 26, 10].pack('C*')
31
31
 
32
+ MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
33
+ MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
34
+ TIFF_HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
35
+ MP3_MIME_TYPE = 'audio/mpeg'
32
36
  # Wraps the Tag object returned by ID3Tag in such
33
37
  # a way that a usable JSON representation gets
34
38
  # returned
@@ -68,6 +72,9 @@ class FormatParser::MP3Parser
68
72
  return if header.start_with?(ZIP_LOCAL_ENTRY_SIGNATURE)
69
73
  return if header.start_with?(PNG_HEADER_BYTES)
70
74
 
75
+ io.seek(0)
76
+ return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
77
+
71
78
  # Read all the ID3 tags (or at least attempt to)
72
79
  io.seek(0)
73
80
  id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
@@ -97,7 +104,8 @@ class FormatParser::MP3Parser
97
104
  # do not tell anything of substance
98
105
  num_audio_channels: first_frame.channels,
99
106
  audio_sample_rate_hz: first_frame.sample_rate,
100
- intrinsics: id3tags_hash.merge(id3tags: tags)
107
+ intrinsics: id3tags_hash.merge(id3tags: tags),
108
+ content_type: MP3_MIME_TYPE,
101
109
  )
102
110
 
103
111
  extra_file_attirbutes = fetch_extra_attributes_from_id3_tags(id3tags_hash)
@@ -3,8 +3,8 @@
3
3
  class FormatParser::OggParser
4
4
  include FormatParser::IOUtils
5
5
 
6
- # Maximum size of an Ogg page
7
6
  MAX_POSSIBLE_PAGE_SIZE = 65307
7
+ OGG_MIME_TYPE = 'audio/ogg'
8
8
 
9
9
  def likely_match?(filename)
10
10
  filename =~ /\.ogg$/i
@@ -45,7 +45,8 @@ class FormatParser::OggParser
45
45
  format: :ogg,
46
46
  audio_sample_rate_hz: sample_rate,
47
47
  num_audio_channels: channels,
48
- media_duration_seconds: duration
48
+ media_duration_seconds: duration,
49
+ content_type: OGG_MIME_TYPE,
49
50
  )
50
51
  end
51
52
 
@@ -1,6 +1,5 @@
1
1
  class FormatParser::PDFParser
2
2
  include FormatParser::IOUtils
3
-
4
3
  # First 9 bytes of a PDF should be in this format, according to:
5
4
  #
6
5
  # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
@@ -8,6 +7,7 @@ class FormatParser::PDFParser
8
7
  # There are however exceptions, which are left out for now.
9
8
  #
10
9
  PDF_MARKER = /%PDF-1\.[0-8]{1}/
10
+ PDF_CONTENT_TYPE = 'application/pdf'
11
11
 
12
12
  def likely_match?(filename)
13
13
  filename =~ /\.(pdf|ai)$/i
@@ -18,7 +18,7 @@ class FormatParser::PDFParser
18
18
 
19
19
  return unless safe_read(io, 9) =~ PDF_MARKER
20
20
 
21
- FormatParser::Document.new(format: :pdf)
21
+ FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
22
22
  end
23
23
 
24
24
  FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
@@ -14,6 +14,7 @@ class FormatParser::PNGParser
14
14
  4 => true, # Grayscale with alpha
15
15
  6 => true,
16
16
  }
17
+ PNG_MIME_TYPE = 'image/png'
17
18
 
18
19
  def likely_match?(filename)
19
20
  filename =~ /\.png$/i
@@ -67,6 +68,7 @@ class FormatParser::PNGParser
67
68
  color_mode: color_mode,
68
69
  has_multiple_frames: has_animation,
69
70
  num_animation_or_video_frames: num_frames,
71
+ content_type: PNG_MIME_TYPE,
70
72
  )
71
73
  end
72
74
 
@@ -2,6 +2,7 @@ class FormatParser::PSDParser
2
2
  include FormatParser::IOUtils
3
3
 
4
4
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
5
+ PSD_MIME_TYPE = 'application/x-photoshop'
5
6
 
6
7
  def likely_match?(filename)
7
8
  filename =~ /\.psd$/i # Maybe also PSB at some point
@@ -20,6 +21,7 @@ class FormatParser::PSDParser
20
21
  format: :psd,
21
22
  width_px: w,
22
23
  height_px: h,
24
+ content_type: PSD_MIME_TYPE,
23
25
  )
24
26
  end
25
27
 
@@ -4,6 +4,9 @@ class FormatParser::TIFFParser
4
4
 
5
5
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
6
6
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
7
+ HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
8
+ TIFF_MIME_TYPE = 'image/tiff'
9
+ ARW_MIME_TYPE = 'image/x-sony-arw'
7
10
 
8
11
  def likely_match?(filename)
9
12
  filename =~ /\.tiff?$/i
@@ -12,8 +15,11 @@ class FormatParser::TIFFParser
12
15
  def call(io)
13
16
  io = FormatParser::IOConstraint.new(io)
14
17
 
15
- return unless [MAGIC_LE, MAGIC_BE].include?(safe_read(io, 4))
16
- io.seek(io.pos + 2) # Skip over the offset of the IFD, EXIFR will re-read it anyway
18
+ return unless HEADER_BYTES.include?(safe_read(io, 4))
19
+
20
+ # Skip over the offset of the IFD,
21
+ # EXIFR will re-read it anyway
22
+ io.seek(io.pos + 2)
17
23
  return if cr2?(io)
18
24
 
19
25
  # The TIFF scanner in EXIFR is plenty good enough,
@@ -25,14 +31,17 @@ class FormatParser::TIFFParser
25
31
  w = exif_data.width || exif_data.pixel_x_dimension
26
32
  h = exif_data.height || exif_data.pixel_y_dimension
27
33
 
34
+ format = arw?(exif_data) ? :arw : :tif
35
+ mime_type = arw?(exif_data) ? ARW_MIME_TYPE : TIFF_MIME_TYPE
28
36
  FormatParser::Image.new(
29
- format: arw?(exif_data) ? :arw : :tif, # Specify format as arw for Sony ARW format images, else tif
37
+ format: format,
30
38
  width_px: w,
31
39
  height_px: h,
32
40
  display_width_px: exif_data.rotated? ? h : w,
33
41
  display_height_px: exif_data.rotated? ? w : h,
34
42
  orientation: exif_data.orientation_sym,
35
43
  intrinsics: {exif: exif_data},
44
+ content_type: mime_type,
36
45
  )
37
46
  rescue EXIFR::MalformedTIFF
38
47
  nil
@@ -1,6 +1,8 @@
1
1
  class FormatParser::WAVParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ WAV_MIME_TYPE = 'audio/x-wav'
5
+
4
6
  def likely_match?(filename)
5
7
  filename =~ /\.wav$/i
6
8
  end
@@ -96,6 +98,7 @@ class FormatParser::WAVParser
96
98
  audio_sample_rate_hz: fmt_data[:sample_rate],
97
99
  media_duration_frames: sample_frames,
98
100
  media_duration_seconds: duration_in_seconds,
101
+ content_type: WAV_MIME_TYPE,
99
102
  )
100
103
  end
101
104
 
@@ -5,6 +5,8 @@ class FormatParser::ZIPParser
5
5
  include OfficeFormats
6
6
  include FormatParser::IOUtils
7
7
 
8
+ ZIP_MIME_TYPE = 'application/zip'
9
+
8
10
  def likely_match?(filename)
9
11
  filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
10
12
  end
@@ -25,10 +27,10 @@ class FormatParser::ZIPParser
25
27
  end
26
28
 
27
29
  if office_document?(filenames_set)
28
- office_format = office_file_format_from_entry_set(filenames_set)
29
- FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
30
+ office_format, mime_type = office_file_format_and_mime_type_from_entry_set(filenames_set)
31
+ FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive, content_type: mime_type)
30
32
  else
31
- FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
33
+ FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive, content_type: ZIP_MIME_TYPE)
32
34
  end
33
35
  rescue FileReader::Error
34
36
  # This is not a ZIP, or a broken ZIP.
@@ -37,15 +37,15 @@ module FormatParser::ZIPParser::OfficeFormats
37
37
  OFFICE_MARKER_FILES.subset?(filenames_set)
38
38
  end
39
39
 
40
- def office_file_format_from_entry_set(filenames_set)
40
+ def office_file_format_and_mime_type_from_entry_set(filenames_set)
41
41
  if filenames_set.include?('word/document.xml')
42
- :docx
42
+ [:docx, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
43
43
  elsif filenames_set.include?('xl/workbook.xml')
44
- :xlsx
44
+ [:xlsx, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
45
45
  elsif filenames_set.include?('ppt/presentation.xml')
46
- :pptx
46
+ [:pptx, 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
47
47
  else
48
- :unknown
48
+ [:unknown, 'application/zip']
49
49
  end
50
50
  end
51
51
  end
@@ -0,0 +1,19 @@
1
+ module FormatParser
2
+ class Text
3
+ include FormatParser::AttributesJSON
4
+
5
+ NATURE = :text
6
+
7
+ attr_accessor :format
8
+ attr_accessor :content_type
9
+
10
+ # Only permits assignments via defined accessors
11
+ def initialize(**attributes)
12
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
13
+ end
14
+
15
+ def nature
16
+ NATURE
17
+ end
18
+ end
19
+ end
@@ -23,6 +23,9 @@ module FormatParser
23
23
  # it can be placed here
24
24
  attr_accessor :intrinsics
25
25
 
26
+ # The MIME type of the video
27
+ attr_accessor :content_type
28
+
26
29
  # Only permits assignments via defined accessors
27
30
  def initialize(**attributes)
28
31
  attributes.map { |(k, v)| public_send("#{k}=", v) }
@@ -173,6 +173,26 @@ describe FormatParser do
173
173
  prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
174
174
  expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
175
175
  end
176
+
177
+ it 'sorts the parsers by priority and name' do
178
+ parsers = FormatParser.parsers_for(
179
+ [:audio, :image],
180
+ [:cr2, :dpx, :fdx, :flac, :gif, :jpg, :mov, :mp4, :m4a, :mp3, :mpg, :mpeg, :ogg, :png, :tif, :wav]
181
+ )
182
+
183
+ expect(parsers.map { |parser| parser.class.name }).to eq([
184
+ 'FormatParser::GIFParser',
185
+ 'Class',
186
+ 'FormatParser::PNGParser',
187
+ 'FormatParser::CR2Parser',
188
+ 'FormatParser::DPXParser',
189
+ 'FormatParser::FLACParser',
190
+ 'FormatParser::MP3Parser',
191
+ 'FormatParser::OggParser',
192
+ 'FormatParser::TIFFParser',
193
+ 'FormatParser::WAVParser'
194
+ ])
195
+ end
176
196
  end
177
197
 
178
198
  describe '.register_parser and .deregister_parser' do
@@ -10,6 +10,7 @@ describe FormatParser::AIFFParser do
10
10
  expect(parse_result.num_audio_channels).to eq(2)
11
11
  expect(parse_result.audio_sample_rate_hz).to be_within(0.01).of(44100)
12
12
  expect(parse_result.media_duration_seconds).to be_within(0.01).of(1.05)
13
+ expect(parse_result.content_type).to eq('audio/x-aiff')
13
14
  end
14
15
 
15
16
  it 'parses a Logic Pro created AIFF sample file having a COMT chunk before a COMM chunk' do
@@ -13,6 +13,8 @@ describe FormatParser::BMPParser do
13
13
  expect(parsed.width_px).to eq(40)
14
14
  expect(parsed.height_px).to eq(27)
15
15
 
16
+ expect(parsed.content_type).to eq('image/bmp')
17
+
16
18
  expect(parsed.intrinsics).not_to be_nil
17
19
  expect(parsed.intrinsics[:vertical_resolution]).to eq(2834)
18
20
  expect(parsed.intrinsics[:horizontal_resolution]).to eq(2834)
@@ -32,6 +34,8 @@ describe FormatParser::BMPParser do
32
34
  expect(parsed.width_px).to eq(1920)
33
35
  expect(parsed.height_px).to eq(1080)
34
36
 
37
+ expect(parsed.content_type).to eq('image/bmp')
38
+
35
39
  expect(parsed.intrinsics).not_to be_nil
36
40
  expect(parsed.intrinsics[:vertical_resolution]).to eq(2835)
37
41
  expect(parsed.intrinsics[:horizontal_resolution]).to eq(2835)
@@ -51,6 +55,8 @@ describe FormatParser::BMPParser do
51
55
  expect(parsed.width_px).to eq(200)
52
56
  expect(parsed.height_px).to eq(200)
53
57
 
58
+ expect(parsed.content_type).to eq('image/bmp')
59
+
54
60
  expect(parsed.intrinsics).not_to be_nil
55
61
  end
56
62
 
@@ -64,6 +70,7 @@ describe FormatParser::BMPParser do
64
70
  expect(parsed.color_mode).to eq(:rgb)
65
71
  expect(parsed.width_px).to eq(40)
66
72
  expect(parsed.height_px).to eq(27)
73
+ expect(parsed.content_type).to eq('image/bmp')
67
74
  expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
68
75
  expect(parsed.intrinsics[:data_order]).to eq(:normal)
69
76
 
@@ -76,6 +83,7 @@ describe FormatParser::BMPParser do
76
83
  expect(parsed.color_mode).to eq(:rgb)
77
84
  expect(parsed.width_px).to eq(40)
78
85
  expect(parsed.height_px).to eq(27)
86
+ expect(parsed.content_type).to eq('image/bmp')
79
87
  expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
80
88
  expect(parsed.intrinsics[:data_order]).to eq(:normal)
81
89
  end
@@ -17,6 +17,7 @@ describe FormatParser::CR2Parser do
17
17
  expect(parsed.height_px).to be > 0
18
18
 
19
19
  expect(parsed.orientation).not_to be_nil
20
+ expect(parsed.content_type).to eq('image/x-canon-cr2')
20
21
  end
21
22
  end
22
23
  end
@@ -15,6 +15,7 @@ describe FormatParser::DPXParser do
15
15
  expect(parsed.width_px).to be_between(0, 2048)
16
16
  expect(parsed.height_px).to be_kind_of(Integer)
17
17
  expect(parsed.height_px).to be_between(0, 4000)
18
+ expect(parsed.content_type).to eq('image/x-dpx')
18
19
  end
19
20
  end
20
21
 
@@ -14,6 +14,7 @@ describe FormatParser::FLACParser do
14
14
  expect(parsed.intrinsics).not_to be_nil
15
15
  expect(parsed.media_duration_frames).to eq(33810)
16
16
  expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
17
+ expect(parsed.content_type).to eq('audio/x-flac')
17
18
  end
18
19
 
19
20
  it 'decodes and estimates duration for the 16bit FLAC File' do
@@ -17,6 +17,7 @@ describe FormatParser::GIFParser do
17
17
 
18
18
  expect(parsed.height_px).to be_kind_of(Integer)
19
19
  expect(parsed.height_px).to be > 0
20
+ expect(parsed.content_type).to eq('image/gif')
20
21
  end
21
22
  end
22
23
  end
@@ -14,6 +14,7 @@ describe FormatParser::JPEGParser do
14
14
 
15
15
  expect(parsed.height_px).to be_kind_of(Integer)
16
16
  expect(parsed.height_px).to be > 0
17
+ expect(parsed.content_type).to eq('image/jpeg')
17
18
  end
18
19
  end
19
20
  end
@@ -0,0 +1,41 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::M3UParser do
4
+ let(:parsed_m3u) do
5
+ subject.call(
6
+ File.open(
7
+ Pathname.new(fixtures_dir).join('M3U').join(m3u_file),
8
+ 'rb'
9
+ )
10
+ )
11
+ end
12
+
13
+ describe 'an m3u file with missing header' do
14
+ let(:m3u_file) { 'plain_text.m3u' }
15
+
16
+ it 'does not parse the file successfully' do
17
+ expect(parsed_m3u).to be_nil
18
+ end
19
+ end
20
+
21
+ describe 'an m3u file with valid header' do
22
+ let(:m3u_file) { 'sample.m3u' }
23
+
24
+ it 'parses the file successfully' do
25
+ expect(parsed_m3u).not_to be_nil
26
+ expect(parsed_m3u.nature).to eq(:text)
27
+ expect(parsed_m3u.format).to eq(:m3u)
28
+ expect(parsed_m3u.content_type).to eq('application/vnd.apple.mpegurl')
29
+ end
30
+ end
31
+
32
+ describe 'an m3u8 file with valid header' do
33
+ let(:m3u_file) { 'sample.m3u8' }
34
+
35
+ it 'parses the file successfully' do
36
+ expect(parsed_m3u).not_to be_nil
37
+ expect(parsed_m3u.nature).to eq(:text)
38
+ expect(parsed_m3u.format).to eq(:m3u)
39
+ end
40
+ end
41
+ end
@@ -37,7 +37,7 @@ describe FormatParser::MOOVParser do
37
37
  expect(result.nature).to eq(:audio)
38
38
  expect(result.media_duration_seconds).to be_kind_of(Float)
39
39
  expect(result.media_duration_seconds).to be > 0
40
-
40
+ expect(result.content_type).to be_kind_of(String)
41
41
  expect(result.intrinsics).not_to be_nil
42
42
  end
43
43
  end
@@ -52,6 +52,7 @@ describe FormatParser::MOOVParser do
52
52
  expect(result.height_px).to be > 0
53
53
  expect(result.media_duration_seconds).to be_kind_of(Float)
54
54
  expect(result.media_duration_seconds).to be > 0
55
+ expect(result.content_type).to eq('video/mp4')
55
56
 
56
57
  expect(result.intrinsics).not_to be_nil
57
58
  end
@@ -67,6 +68,7 @@ describe FormatParser::MOOVParser do
67
68
  expect(result.height_px).to be > 0
68
69
  expect(result.media_duration_seconds).to be_kind_of(Float)
69
70
  expect(result.media_duration_seconds).to be > 0
71
+ expect(result.content_type).to eq('video/mp4')
70
72
 
71
73
  expect(result.intrinsics).not_to be_nil
72
74
  end
@@ -79,6 +81,7 @@ describe FormatParser::MOOVParser do
79
81
  expect(result).not_to be_nil
80
82
  expect(result.nature).to eq(:audio)
81
83
  expect(result.format).to eq(:m4a)
84
+ expect(result.content_type).to eq('audio/mp4')
82
85
  end
83
86
 
84
87
  it 'parses a MOV file and provides the necessary metadata' do
@@ -23,6 +23,7 @@ describe FormatParser::MP3Parser do
23
23
 
24
24
  expect(parsed.nature).to eq(:audio)
25
25
  expect(parsed.format).to eq(:mp3)
26
+ expect(parsed.content_type).to eq('audio/mpeg')
26
27
  expect(parsed.num_audio_channels).to eq(2)
27
28
  expect(parsed.audio_sample_rate_hz).to eq(48000)
28
29
  expect(parsed.intrinsics).not_to be_nil
@@ -205,4 +206,12 @@ describe FormatParser::MP3Parser do
205
206
  ).to eq([ID3Tag::Tag])
206
207
  end
207
208
  end
209
+
210
+ it 'does not recognize TIFF files as MP3' do
211
+ fpath = fixtures_dir + '/TIFF/test2.tif'
212
+
213
+ parsed = subject.call(File.open(fpath, 'rb'))
214
+
215
+ expect(parsed).to be_nil
216
+ end
208
217
  end
@@ -6,6 +6,7 @@ describe FormatParser::OggParser do
6
6
 
7
7
  expect(parse_result.nature).to eq(:audio)
8
8
  expect(parse_result.format).to eq(:ogg)
9
+ expect(parse_result.content_type).to eq('audio/ogg')
9
10
  expect(parse_result.num_audio_channels).to eq(1)
10
11
  expect(parse_result.audio_sample_rate_hz).to eq(16000)
11
12
  expect(parse_result.media_duration_seconds).to be_within(0.01).of(2973.95)
@@ -17,6 +17,7 @@ describe FormatParser::PDFParser do
17
17
  expect(parsed_pdf).not_to be_nil
18
18
  expect(parsed_pdf.nature).to eq(:document)
19
19
  expect(parsed_pdf.format).to eq(:pdf)
20
+ expect(parsed_pdf.content_type).to eq('application/pdf')
20
21
  end
21
22
  end
22
23
 
@@ -15,6 +15,7 @@ describe FormatParser::PNGParser do
15
15
 
16
16
  expect(parsed.height_px).to be_kind_of(Integer)
17
17
  expect(parsed.height_px).to be > 0
18
+ expect(parsed.content_type).to eq('image/png')
18
19
  end
19
20
  end
20
21
  end
@@ -15,6 +15,7 @@ describe FormatParser::PSDParser do
15
15
 
16
16
  expect(parsed.height_px).to be_kind_of(Integer)
17
17
  expect(parsed.height_px).to be > 0
18
+ expect(parsed.content_type).to eq('application/x-photoshop')
18
19
  end
19
20
  end
20
21
  end
@@ -59,6 +59,7 @@ describe FormatParser::TIFFParser do
59
59
  expect(parsed.width_px).to eq(7952)
60
60
  expect(parsed.height_px).to eq(5304)
61
61
  expect(parsed.intrinsics[:exif]).not_to be_nil
62
+ expect(parsed.content_type).to eq('image/x-sony-arw')
62
63
  end
63
64
 
64
65
  describe 'correctly extracts dimensions from various TIFF flavors of the same file' do
@@ -9,6 +9,7 @@ describe FormatParser::WAVParser do
9
9
 
10
10
  expect(parse_result.nature).to eq(:audio)
11
11
  expect(parse_result.format).to eq(:wav)
12
+ expect(parse_result.content_type).to eq('audio/x-wav')
12
13
  end
13
14
  end
14
15
 
@@ -14,6 +14,7 @@ describe FormatParser::ZIPParser do
14
14
  expect(result).not_to be_nil
15
15
 
16
16
  expect(result.format).to eq(:zip)
17
+ expect(result.content_type).to eq('application/zip')
17
18
  expect(result.nature).to eq(:archive)
18
19
  expect(result.entries.length).to eq(0xFFFF + 1)
19
20
 
@@ -58,6 +59,7 @@ describe FormatParser::ZIPParser do
58
59
  result = subject.call(fi_io)
59
60
  expect(result.nature).to eq(:document)
60
61
  expect(result.format).to eq(:docx)
62
+ expect(result.content_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
61
63
 
62
64
  fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
63
65
  fi_io = File.open(fixture_path, 'rb')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.25.3
4
+ version: 0.27.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2020-10-08 00:00:00.000000000 Z
12
+ date: 2021-01-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -219,6 +219,7 @@ files:
219
219
  - lib/parsers/flac_parser.rb
220
220
  - lib/parsers/gif_parser.rb
221
221
  - lib/parsers/jpeg_parser.rb
222
+ - lib/parsers/m3u_parser.rb
222
223
  - lib/parsers/moov_parser.rb
223
224
  - lib/parsers/moov_parser/decoder.rb
224
225
  - lib/parsers/mp3_parser.rb
@@ -236,6 +237,7 @@ files:
236
237
  - lib/read_limiter.rb
237
238
  - lib/read_limits_config.rb
238
239
  - lib/remote_io.rb
240
+ - lib/text.rb
239
241
  - lib/video.rb
240
242
  - spec/active_storage/blob_io_spec.rb
241
243
  - spec/active_storage/rails_app_spec.rb
@@ -257,6 +259,7 @@ files:
257
259
  - spec/parsers/flac_parser_spec.rb
258
260
  - spec/parsers/gif_parser_spec.rb
259
261
  - spec/parsers/jpeg_parser_spec.rb
262
+ - spec/parsers/m3u_parser_spec.rb
260
263
  - spec/parsers/moov_parser_spec.rb
261
264
  - spec/parsers/mp3_parser_spec.rb
262
265
  - spec/parsers/mpeg_parser_spec.rb
@@ -292,7 +295,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
292
295
  - !ruby/object:Gem::Version
293
296
  version: '0'
294
297
  requirements: []
295
- rubygems_version: 3.1.2
298
+ rubygems_version: 3.0.3
296
299
  signing_key:
297
300
  specification_version: 4
298
301
  summary: A library for efficient parsing of file metadata