format_parser 0.26.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/lib/archive.rb +3 -0
- data/lib/audio.rb +3 -0
- data/lib/document.rb +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/image.rb +3 -0
- data/lib/parsers/aiff_parser.rb +4 -1
- data/lib/parsers/bmp_parser.rb +3 -0
- data/lib/parsers/cr2_parser.rb +2 -0
- data/lib/parsers/dpx_parser.rb +6 -0
- data/lib/parsers/flac_parser.rb +2 -0
- data/lib/parsers/gif_parser.rb +2 -0
- data/lib/parsers/jpeg_parser.rb +2 -0
- data/lib/parsers/m3u_parser.rb +3 -1
- data/lib/parsers/moov_parser.rb +10 -1
- data/lib/parsers/mp3_parser.rb +3 -2
- data/lib/parsers/ogg_parser.rb +3 -2
- data/lib/parsers/pdf_parser.rb +2 -2
- data/lib/parsers/png_parser.rb +2 -0
- data/lib/parsers/psd_parser.rb +2 -0
- data/lib/parsers/tiff_parser.rb +10 -2
- data/lib/parsers/wav_parser.rb +3 -0
- data/lib/parsers/zip_parser.rb +5 -3
- data/lib/parsers/zip_parser/office_formats.rb +5 -5
- data/lib/text.rb +1 -0
- data/lib/video.rb +3 -0
- data/spec/parsers/aiff_parser_spec.rb +1 -0
- data/spec/parsers/bmp_parser_spec.rb +8 -0
- data/spec/parsers/cr2_parser_spec.rb +1 -0
- data/spec/parsers/dpx_parser_spec.rb +1 -0
- data/spec/parsers/flac_parser_spec.rb +1 -0
- data/spec/parsers/gif_parser_spec.rb +1 -0
- data/spec/parsers/jpeg_parser_spec.rb +1 -0
- data/spec/parsers/m3u_parser_spec.rb +1 -0
- data/spec/parsers/moov_parser_spec.rb +4 -1
- data/spec/parsers/mp3_parser_spec.rb +1 -0
- data/spec/parsers/ogg_parser_spec.rb +1 -0
- data/spec/parsers/pdf_parser_spec.rb +1 -0
- data/spec/parsers/png_parser_spec.rb +1 -0
- data/spec/parsers/psd_parser_spec.rb +1 -0
- data/spec/parsers/tiff_parser_spec.rb +1 -0
- data/spec/parsers/wav_parser_spec.rb +1 -0
- data/spec/parsers/zip_parser_spec.rb +2 -0
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a10ceeaca4d0d6d2336b94f9fc397781ae2ffabdb588cee7ebc59fdcb968082
|
4
|
+
data.tar.gz: c28b8b7a0eb1d83e9f93406a4bbbad0699e50248d49d413393d6c3a6d82f7acf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b3780e0f615f8a42aa097b652b675ceba5fb2325c3f2ad7472e178204cef3838967ea56c25e0daa1f5c666df42c5f5d4252a4dd42dae26389c41288d5b56d30
|
7
|
+
data.tar.gz: f6f22e664f8603e691795b44902cc5d677f63a02b4950233ea09719b9b2e8ae7d9eea57e406bd4f5b9e252aff8e9c5a684cdb1155846f0eedeceeba7fe7131c0
|
data/CHANGELOG.md
CHANGED
data/lib/archive.rb
CHANGED
@@ -26,6 +26,9 @@ module FormatParser
|
|
26
26
|
# it can be placed here
|
27
27
|
attr_accessor :intrinsics
|
28
28
|
|
29
|
+
# The MIME type of the archive
|
30
|
+
attr_accessor :content_type
|
31
|
+
|
29
32
|
# Only permits assignments via defined accessors
|
30
33
|
def initialize(**attributes)
|
31
34
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
data/lib/audio.rb
CHANGED
@@ -35,6 +35,9 @@ module FormatParser
|
|
35
35
|
# it can be placed here
|
36
36
|
attr_accessor :intrinsics
|
37
37
|
|
38
|
+
# The MIME type of the sound file
|
39
|
+
attr_accessor :content_type
|
40
|
+
|
38
41
|
# Only permits assignments via defined accessors
|
39
42
|
def initialize(**attributes)
|
40
43
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
data/lib/document.rb
CHANGED
data/lib/image.rb
CHANGED
@@ -64,6 +64,9 @@ module FormatParser
|
|
64
64
|
# it can be placed here
|
65
65
|
attr_accessor :intrinsics
|
66
66
|
|
67
|
+
# The MIME type of the image file
|
68
|
+
attr_accessor :content_type
|
69
|
+
|
67
70
|
# Only permits assignments via defined accessors
|
68
71
|
def initialize(**attributes)
|
69
72
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
data/lib/parsers/aiff_parser.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
class FormatParser::AIFFParser
|
2
2
|
include FormatParser::IOUtils
|
3
3
|
|
4
|
+
AIFF_MIME_TYPE = 'audio/x-aiff'
|
5
|
+
|
4
6
|
# Known chunk types we can omit when parsing,
|
5
7
|
# grossly lifted from http://www.muratnkonar.com/aiff/
|
6
8
|
KNOWN_CHUNKS = [
|
@@ -70,7 +72,8 @@ class FormatParser::AIFFParser
|
|
70
72
|
num_audio_channels: channels,
|
71
73
|
audio_sample_rate_hz: sample_rate.to_i,
|
72
74
|
media_duration_frames: sample_frames,
|
73
|
-
media_duration_seconds: duration_in_seconds
|
75
|
+
media_duration_seconds: duration_in_seconds,
|
76
|
+
content_type: AIFF_MIME_TYPE,
|
74
77
|
)
|
75
78
|
end
|
76
79
|
|
data/lib/parsers/bmp_parser.rb
CHANGED
@@ -5,6 +5,7 @@ class FormatParser::BMPParser
|
|
5
5
|
|
6
6
|
VALID_BMP = 'BM'
|
7
7
|
PERMISSIBLE_PIXEL_ARRAY_LOCATIONS = 26..512
|
8
|
+
BMP_MIME_TYPE = 'image/bmp'
|
8
9
|
|
9
10
|
def likely_match?(filename)
|
10
11
|
filename =~ /\.bmp$/i
|
@@ -42,6 +43,7 @@ class FormatParser::BMPParser
|
|
42
43
|
width_px: width,
|
43
44
|
height_px: height,
|
44
45
|
color_mode: :rgb,
|
46
|
+
content_type: BMP_MIME_TYPE,
|
45
47
|
intrinsics: {
|
46
48
|
data_order: data_order,
|
47
49
|
bits_per_pixel: bit_depth
|
@@ -63,6 +65,7 @@ class FormatParser::BMPParser
|
|
63
65
|
width_px: width,
|
64
66
|
height_px: height.abs,
|
65
67
|
color_mode: :rgb,
|
68
|
+
content_type: BMP_MIME_TYPE,
|
66
69
|
intrinsics: {
|
67
70
|
vertical_resolution: vertical_res,
|
68
71
|
horizontal_resolution: horizontal_res,
|
data/lib/parsers/cr2_parser.rb
CHANGED
@@ -6,6 +6,7 @@ class FormatParser::CR2Parser
|
|
6
6
|
|
7
7
|
TIFF_HEADER = [0x49, 0x49, 0x2a, 0x00]
|
8
8
|
CR2_HEADER = [0x43, 0x52, 0x02, 0x00]
|
9
|
+
CR2_MIME_TYPE = 'image/x-canon-cr2'
|
9
10
|
|
10
11
|
def likely_match?(filename)
|
11
12
|
filename =~ /\.cr2$/i
|
@@ -39,6 +40,7 @@ class FormatParser::CR2Parser
|
|
39
40
|
display_height_px: exif_data.rotated? ? w : h,
|
40
41
|
orientation: exif_data.orientation_sym,
|
41
42
|
intrinsics: {exif: exif_data},
|
43
|
+
content_type: CR2_MIME_TYPE,
|
42
44
|
)
|
43
45
|
rescue EXIFR::MalformedTIFF
|
44
46
|
nil
|
data/lib/parsers/dpx_parser.rb
CHANGED
@@ -6,6 +6,11 @@ class FormatParser::DPXParser
|
|
6
6
|
BE_MAGIC = 'SDPX'
|
7
7
|
LE_MAGIC = BE_MAGIC.reverse
|
8
8
|
|
9
|
+
# There is no official MIME type for DPX, so we have
|
10
|
+
# to invent something useful. We will prefix it with x-
|
11
|
+
# to indicate that it is a vendor subtype
|
12
|
+
DPX_MIME_TYPE = 'image/x-dpx'
|
13
|
+
|
9
14
|
class ByteOrderHintIO < SimpleDelegator
|
10
15
|
def initialize(io, is_little_endian)
|
11
16
|
super(io)
|
@@ -61,6 +66,7 @@ class FormatParser::DPXParser
|
|
61
66
|
display_width_px: display_w,
|
62
67
|
display_height_px: display_h,
|
63
68
|
intrinsics: dpx_structure,
|
69
|
+
content_type: DPX_MIME_TYPE,
|
64
70
|
)
|
65
71
|
end
|
66
72
|
|
data/lib/parsers/flac_parser.rb
CHANGED
@@ -4,6 +4,7 @@ class FormatParser::FLACParser
|
|
4
4
|
MAGIC_BYTES = 4
|
5
5
|
MAGIC_BYTE_STRING = 'fLaC'
|
6
6
|
BLOCK_HEADER_BYTES = 4
|
7
|
+
FLAC_MIME_TYPE = 'audio/x-flac'
|
7
8
|
|
8
9
|
def likely_match?(filename)
|
9
10
|
filename =~ /\.flac$/i
|
@@ -61,6 +62,7 @@ class FormatParser::FLACParser
|
|
61
62
|
audio_sample_rate_hz: sample_rate,
|
62
63
|
media_duration_seconds: duration,
|
63
64
|
media_duration_frames: total_samples,
|
65
|
+
content_type: FLAC_MIME_TYPE,
|
64
66
|
intrinsics: {
|
65
67
|
bits_per_sample: bits_per_sample,
|
66
68
|
minimum_frame_size: minimum_frame_size,
|
data/lib/parsers/gif_parser.rb
CHANGED
@@ -3,6 +3,7 @@ class FormatParser::GIFParser
|
|
3
3
|
|
4
4
|
HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
|
5
5
|
NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
|
6
|
+
GIF_MIME_TYPE = 'image/gif'
|
6
7
|
|
7
8
|
def likely_match?(filename)
|
8
9
|
filename =~ /\.gif$/i
|
@@ -45,6 +46,7 @@ class FormatParser::GIFParser
|
|
45
46
|
height_px: h,
|
46
47
|
has_multiple_frames: is_animated,
|
47
48
|
color_mode: :indexed,
|
49
|
+
content_type: GIF_MIME_TYPE
|
48
50
|
)
|
49
51
|
end
|
50
52
|
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -12,6 +12,7 @@ class FormatParser::JPEGParser
|
|
12
12
|
APP1_MARKER = 0xE1 # maybe EXIF
|
13
13
|
EXIF_MAGIC_STRING = "Exif\0\0".b
|
14
14
|
MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
|
15
|
+
JPEG_MIME_TYPE = 'image/jpeg'
|
15
16
|
|
16
17
|
def self.likely_match?(filename)
|
17
18
|
filename =~ /\.jpe?g$/i
|
@@ -88,6 +89,7 @@ class FormatParser::JPEGParser
|
|
88
89
|
display_height_px: dh,
|
89
90
|
orientation: flat_exif.orientation_sym,
|
90
91
|
intrinsics: {exif: flat_exif},
|
92
|
+
content_type: JPEG_MIME_TYPE
|
91
93
|
)
|
92
94
|
|
93
95
|
return result
|
data/lib/parsers/m3u_parser.rb
CHANGED
@@ -2,6 +2,7 @@ class FormatParser::M3UParser
|
|
2
2
|
include FormatParser::IOUtils
|
3
3
|
|
4
4
|
HEADER = '#EXTM3U'
|
5
|
+
M3U8_MIME_TYPE = 'application/vnd.apple.mpegurl' # https://en.wikipedia.org/wiki/M3U#Internet_media_types
|
5
6
|
|
6
7
|
def likely_match?(filename)
|
7
8
|
filename =~ /\.m3u8?$/i
|
@@ -14,7 +15,8 @@ class FormatParser::M3UParser
|
|
14
15
|
return unless HEADER.eql?(header)
|
15
16
|
|
16
17
|
FormatParser::Text.new(
|
17
|
-
format: :m3u
|
18
|
+
format: :m3u,
|
19
|
+
content_type: M3U8_MIME_TYPE,
|
18
20
|
)
|
19
21
|
end
|
20
22
|
FormatParser.register_parser new, natures: :text, formats: :m3u
|
data/lib/parsers/moov_parser.rb
CHANGED
@@ -11,6 +11,12 @@ class FormatParser::MOOVParser
|
|
11
11
|
'm4a ' => :m4a,
|
12
12
|
}
|
13
13
|
|
14
|
+
# https://tools.ietf.org/html/rfc4337#section-2
|
15
|
+
# There is also video/quicktime which we should be able to capture
|
16
|
+
# here, but there is currently no detection for MOVs versus MP4s
|
17
|
+
MP4_AU_MIME_TYPE = 'audio/mp4'
|
18
|
+
MP4_MIXED_MIME_TYPE = 'video/mp4'
|
19
|
+
|
14
20
|
def likely_match?(filename)
|
15
21
|
filename =~ /\.(mov|m4a|ma4|mp4|aac|m4v)$/i
|
16
22
|
end
|
@@ -49,10 +55,12 @@ class FormatParser::MOOVParser
|
|
49
55
|
end
|
50
56
|
|
51
57
|
# M4A only contains audio, while MP4 and friends can contain video.
|
52
|
-
|
58
|
+
fmt = format_from_moov_type(file_type)
|
59
|
+
if fmt == :m4a
|
53
60
|
FormatParser::Audio.new(
|
54
61
|
format: format_from_moov_type(file_type),
|
55
62
|
media_duration_seconds: media_duration_s,
|
63
|
+
content_type: MP4_AU_MIME_TYPE,
|
56
64
|
intrinsics: atom_tree,
|
57
65
|
)
|
58
66
|
else
|
@@ -61,6 +69,7 @@ class FormatParser::MOOVParser
|
|
61
69
|
width_px: width,
|
62
70
|
height_px: height,
|
63
71
|
media_duration_seconds: media_duration_s,
|
72
|
+
content_type: MP4_MIXED_MIME_TYPE,
|
64
73
|
intrinsics: atom_tree,
|
65
74
|
)
|
66
75
|
end
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -32,7 +32,7 @@ class FormatParser::MP3Parser
|
|
32
32
|
MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
|
33
33
|
MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
|
34
34
|
TIFF_HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
|
35
|
-
|
35
|
+
MP3_MIME_TYPE = 'audio/mpeg'
|
36
36
|
# Wraps the Tag object returned by ID3Tag in such
|
37
37
|
# a way that a usable JSON representation gets
|
38
38
|
# returned
|
@@ -104,7 +104,8 @@ class FormatParser::MP3Parser
|
|
104
104
|
# do not tell anything of substance
|
105
105
|
num_audio_channels: first_frame.channels,
|
106
106
|
audio_sample_rate_hz: first_frame.sample_rate,
|
107
|
-
intrinsics: id3tags_hash.merge(id3tags: tags)
|
107
|
+
intrinsics: id3tags_hash.merge(id3tags: tags),
|
108
|
+
content_type: MP3_MIME_TYPE,
|
108
109
|
)
|
109
110
|
|
110
111
|
extra_file_attirbutes = fetch_extra_attributes_from_id3_tags(id3tags_hash)
|
data/lib/parsers/ogg_parser.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
class FormatParser::OggParser
|
4
4
|
include FormatParser::IOUtils
|
5
5
|
|
6
|
-
# Maximum size of an Ogg page
|
7
6
|
MAX_POSSIBLE_PAGE_SIZE = 65307
|
7
|
+
OGG_MIME_TYPE = 'audio/ogg'
|
8
8
|
|
9
9
|
def likely_match?(filename)
|
10
10
|
filename =~ /\.ogg$/i
|
@@ -45,7 +45,8 @@ class FormatParser::OggParser
|
|
45
45
|
format: :ogg,
|
46
46
|
audio_sample_rate_hz: sample_rate,
|
47
47
|
num_audio_channels: channels,
|
48
|
-
media_duration_seconds: duration
|
48
|
+
media_duration_seconds: duration,
|
49
|
+
content_type: OGG_MIME_TYPE,
|
49
50
|
)
|
50
51
|
end
|
51
52
|
|
data/lib/parsers/pdf_parser.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
class FormatParser::PDFParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
-
|
4
3
|
# First 9 bytes of a PDF should be in this format, according to:
|
5
4
|
#
|
6
5
|
# https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
|
@@ -8,6 +7,7 @@ class FormatParser::PDFParser
|
|
8
7
|
# There are however exceptions, which are left out for now.
|
9
8
|
#
|
10
9
|
PDF_MARKER = /%PDF-1\.[0-8]{1}/
|
10
|
+
PDF_CONTENT_TYPE = 'application/pdf'
|
11
11
|
|
12
12
|
def likely_match?(filename)
|
13
13
|
filename =~ /\.(pdf|ai)$/i
|
@@ -18,7 +18,7 @@ class FormatParser::PDFParser
|
|
18
18
|
|
19
19
|
return unless safe_read(io, 9) =~ PDF_MARKER
|
20
20
|
|
21
|
-
FormatParser::Document.new(format: :pdf)
|
21
|
+
FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
|
22
22
|
end
|
23
23
|
|
24
24
|
FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
|
data/lib/parsers/png_parser.rb
CHANGED
@@ -14,6 +14,7 @@ class FormatParser::PNGParser
|
|
14
14
|
4 => true, # Grayscale with alpha
|
15
15
|
6 => true,
|
16
16
|
}
|
17
|
+
PNG_MIME_TYPE = 'image/png'
|
17
18
|
|
18
19
|
def likely_match?(filename)
|
19
20
|
filename =~ /\.png$/i
|
@@ -67,6 +68,7 @@ class FormatParser::PNGParser
|
|
67
68
|
color_mode: color_mode,
|
68
69
|
has_multiple_frames: has_animation,
|
69
70
|
num_animation_or_video_frames: num_frames,
|
71
|
+
content_type: PNG_MIME_TYPE,
|
70
72
|
)
|
71
73
|
end
|
72
74
|
|
data/lib/parsers/psd_parser.rb
CHANGED
@@ -2,6 +2,7 @@ class FormatParser::PSDParser
|
|
2
2
|
include FormatParser::IOUtils
|
3
3
|
|
4
4
|
PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
|
5
|
+
PSD_MIME_TYPE = 'application/x-photoshop'
|
5
6
|
|
6
7
|
def likely_match?(filename)
|
7
8
|
filename =~ /\.psd$/i # Maybe also PSB at some point
|
@@ -20,6 +21,7 @@ class FormatParser::PSDParser
|
|
20
21
|
format: :psd,
|
21
22
|
width_px: w,
|
22
23
|
height_px: h,
|
24
|
+
content_type: PSD_MIME_TYPE,
|
23
25
|
)
|
24
26
|
end
|
25
27
|
|
data/lib/parsers/tiff_parser.rb
CHANGED
@@ -5,6 +5,8 @@ class FormatParser::TIFFParser
|
|
5
5
|
MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
|
6
6
|
MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
|
7
7
|
HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
|
8
|
+
TIFF_MIME_TYPE = 'image/tiff'
|
9
|
+
ARW_MIME_TYPE = 'image/x-sony-arw'
|
8
10
|
|
9
11
|
def likely_match?(filename)
|
10
12
|
filename =~ /\.tiff?$/i
|
@@ -14,7 +16,10 @@ class FormatParser::TIFFParser
|
|
14
16
|
io = FormatParser::IOConstraint.new(io)
|
15
17
|
|
16
18
|
return unless HEADER_BYTES.include?(safe_read(io, 4))
|
17
|
-
|
19
|
+
|
20
|
+
# Skip over the offset of the IFD,
|
21
|
+
# EXIFR will re-read it anyway
|
22
|
+
io.seek(io.pos + 2)
|
18
23
|
return if cr2?(io)
|
19
24
|
|
20
25
|
# The TIFF scanner in EXIFR is plenty good enough,
|
@@ -26,14 +31,17 @@ class FormatParser::TIFFParser
|
|
26
31
|
w = exif_data.width || exif_data.pixel_x_dimension
|
27
32
|
h = exif_data.height || exif_data.pixel_y_dimension
|
28
33
|
|
34
|
+
format = arw?(exif_data) ? :arw : :tif
|
35
|
+
mime_type = arw?(exif_data) ? ARW_MIME_TYPE : TIFF_MIME_TYPE
|
29
36
|
FormatParser::Image.new(
|
30
|
-
format:
|
37
|
+
format: format,
|
31
38
|
width_px: w,
|
32
39
|
height_px: h,
|
33
40
|
display_width_px: exif_data.rotated? ? h : w,
|
34
41
|
display_height_px: exif_data.rotated? ? w : h,
|
35
42
|
orientation: exif_data.orientation_sym,
|
36
43
|
intrinsics: {exif: exif_data},
|
44
|
+
content_type: mime_type,
|
37
45
|
)
|
38
46
|
rescue EXIFR::MalformedTIFF
|
39
47
|
nil
|
data/lib/parsers/wav_parser.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
class FormatParser::WAVParser
|
2
2
|
include FormatParser::IOUtils
|
3
3
|
|
4
|
+
WAV_MIME_TYPE = 'audio/x-wav'
|
5
|
+
|
4
6
|
def likely_match?(filename)
|
5
7
|
filename =~ /\.wav$/i
|
6
8
|
end
|
@@ -96,6 +98,7 @@ class FormatParser::WAVParser
|
|
96
98
|
audio_sample_rate_hz: fmt_data[:sample_rate],
|
97
99
|
media_duration_frames: sample_frames,
|
98
100
|
media_duration_seconds: duration_in_seconds,
|
101
|
+
content_type: WAV_MIME_TYPE,
|
99
102
|
)
|
100
103
|
end
|
101
104
|
|
data/lib/parsers/zip_parser.rb
CHANGED
@@ -5,6 +5,8 @@ class FormatParser::ZIPParser
|
|
5
5
|
include OfficeFormats
|
6
6
|
include FormatParser::IOUtils
|
7
7
|
|
8
|
+
ZIP_MIME_TYPE = 'application/zip'
|
9
|
+
|
8
10
|
def likely_match?(filename)
|
9
11
|
filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
|
10
12
|
end
|
@@ -25,10 +27,10 @@ class FormatParser::ZIPParser
|
|
25
27
|
end
|
26
28
|
|
27
29
|
if office_document?(filenames_set)
|
28
|
-
office_format =
|
29
|
-
FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
|
30
|
+
office_format, mime_type = office_file_format_and_mime_type_from_entry_set(filenames_set)
|
31
|
+
FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive, content_type: mime_type)
|
30
32
|
else
|
31
|
-
FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
|
33
|
+
FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive, content_type: ZIP_MIME_TYPE)
|
32
34
|
end
|
33
35
|
rescue FileReader::Error
|
34
36
|
# This is not a ZIP, or a broken ZIP.
|
data/lib/parsers/zip_parser/office_formats.rb
CHANGED
@@ -37,15 +37,15 @@ module FormatParser::ZIPParser::OfficeFormats
|
|
37
37
|
OFFICE_MARKER_FILES.subset?(filenames_set)
|
38
38
|
end
|
39
39
|
|
40
|
-
def
|
40
|
+
def office_file_format_and_mime_type_from_entry_set(filenames_set)
|
41
41
|
if filenames_set.include?('word/document.xml')
|
42
|
-
:docx
|
42
|
+
[:docx, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
43
43
|
elsif filenames_set.include?('xl/workbook.xml')
|
44
|
-
:xlsx
|
44
|
+
[:xlsx, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
|
45
45
|
elsif filenames_set.include?('ppt/presentation.xml')
|
46
|
-
:pptx
|
46
|
+
[:pptx, 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
|
47
47
|
else
|
48
|
-
:unknown
|
48
|
+
[:unknown, 'application/zip']
|
49
49
|
end
|
50
50
|
end
|
51
51
|
end
|
data/lib/text.rb
CHANGED
data/lib/video.rb
CHANGED
@@ -23,6 +23,9 @@ module FormatParser
|
|
23
23
|
# it can be placed here
|
24
24
|
attr_accessor :intrinsics
|
25
25
|
|
26
|
+
# The MIME type of the video
|
27
|
+
attr_accessor :content_type
|
28
|
+
|
26
29
|
# Only permits assignments via defined accessors
|
27
30
|
def initialize(**attributes)
|
28
31
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
data/spec/parsers/aiff_parser_spec.rb
CHANGED
@@ -10,6 +10,7 @@ describe FormatParser::AIFFParser do
|
|
10
10
|
expect(parse_result.num_audio_channels).to eq(2)
|
11
11
|
expect(parse_result.audio_sample_rate_hz).to be_within(0.01).of(44100)
|
12
12
|
expect(parse_result.media_duration_seconds).to be_within(0.01).of(1.05)
|
13
|
+
expect(parse_result.content_type).to eq('audio/x-aiff')
|
13
14
|
end
|
14
15
|
|
15
16
|
it 'parses a Logic Pro created AIFF sample file having a COMT chunk before a COMM chunk' do
|
data/spec/parsers/bmp_parser_spec.rb
CHANGED
@@ -13,6 +13,8 @@ describe FormatParser::BMPParser do
|
|
13
13
|
expect(parsed.width_px).to eq(40)
|
14
14
|
expect(parsed.height_px).to eq(27)
|
15
15
|
|
16
|
+
expect(parsed.content_type).to eq('image/bmp')
|
17
|
+
|
16
18
|
expect(parsed.intrinsics).not_to be_nil
|
17
19
|
expect(parsed.intrinsics[:vertical_resolution]).to eq(2834)
|
18
20
|
expect(parsed.intrinsics[:horizontal_resolution]).to eq(2834)
|
@@ -32,6 +34,8 @@ describe FormatParser::BMPParser do
|
|
32
34
|
expect(parsed.width_px).to eq(1920)
|
33
35
|
expect(parsed.height_px).to eq(1080)
|
34
36
|
|
37
|
+
expect(parsed.content_type).to eq('image/bmp')
|
38
|
+
|
35
39
|
expect(parsed.intrinsics).not_to be_nil
|
36
40
|
expect(parsed.intrinsics[:vertical_resolution]).to eq(2835)
|
37
41
|
expect(parsed.intrinsics[:horizontal_resolution]).to eq(2835)
|
@@ -51,6 +55,8 @@ describe FormatParser::BMPParser do
|
|
51
55
|
expect(parsed.width_px).to eq(200)
|
52
56
|
expect(parsed.height_px).to eq(200)
|
53
57
|
|
58
|
+
expect(parsed.content_type).to eq('image/bmp')
|
59
|
+
|
54
60
|
expect(parsed.intrinsics).not_to be_nil
|
55
61
|
end
|
56
62
|
|
@@ -64,6 +70,7 @@ describe FormatParser::BMPParser do
|
|
64
70
|
expect(parsed.color_mode).to eq(:rgb)
|
65
71
|
expect(parsed.width_px).to eq(40)
|
66
72
|
expect(parsed.height_px).to eq(27)
|
73
|
+
expect(parsed.content_type).to eq('image/bmp')
|
67
74
|
expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
|
68
75
|
expect(parsed.intrinsics[:data_order]).to eq(:normal)
|
69
76
|
|
@@ -76,6 +83,7 @@ describe FormatParser::BMPParser do
|
|
76
83
|
expect(parsed.color_mode).to eq(:rgb)
|
77
84
|
expect(parsed.width_px).to eq(40)
|
78
85
|
expect(parsed.height_px).to eq(27)
|
86
|
+
expect(parsed.content_type).to eq('image/bmp')
|
79
87
|
expect(parsed.intrinsics[:bits_per_pixel]).to eq(24)
|
80
88
|
expect(parsed.intrinsics[:data_order]).to eq(:normal)
|
81
89
|
end
|
data/spec/parsers/flac_parser_spec.rb
CHANGED
@@ -14,6 +14,7 @@ describe FormatParser::FLACParser do
|
|
14
14
|
expect(parsed.intrinsics).not_to be_nil
|
15
15
|
expect(parsed.media_duration_frames).to eq(33810)
|
16
16
|
expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
|
17
|
+
expect(parsed.content_type).to eq('audio/x-flac')
|
17
18
|
end
|
18
19
|
|
19
20
|
it 'decodes and estimates duration for the 16bit FLAC File' do
|
data/spec/parsers/moov_parser_spec.rb
CHANGED
@@ -37,7 +37,7 @@ describe FormatParser::MOOVParser do
|
|
37
37
|
expect(result.nature).to eq(:audio)
|
38
38
|
expect(result.media_duration_seconds).to be_kind_of(Float)
|
39
39
|
expect(result.media_duration_seconds).to be > 0
|
40
|
-
|
40
|
+
expect(result.content_type).to be_kind_of(String)
|
41
41
|
expect(result.intrinsics).not_to be_nil
|
42
42
|
end
|
43
43
|
end
|
@@ -52,6 +52,7 @@ describe FormatParser::MOOVParser do
|
|
52
52
|
expect(result.height_px).to be > 0
|
53
53
|
expect(result.media_duration_seconds).to be_kind_of(Float)
|
54
54
|
expect(result.media_duration_seconds).to be > 0
|
55
|
+
expect(result.content_type).to eq('video/mp4')
|
55
56
|
|
56
57
|
expect(result.intrinsics).not_to be_nil
|
57
58
|
end
|
@@ -67,6 +68,7 @@ describe FormatParser::MOOVParser do
|
|
67
68
|
expect(result.height_px).to be > 0
|
68
69
|
expect(result.media_duration_seconds).to be_kind_of(Float)
|
69
70
|
expect(result.media_duration_seconds).to be > 0
|
71
|
+
expect(result.content_type).to eq('video/mp4')
|
70
72
|
|
71
73
|
expect(result.intrinsics).not_to be_nil
|
72
74
|
end
|
@@ -79,6 +81,7 @@ describe FormatParser::MOOVParser do
|
|
79
81
|
expect(result).not_to be_nil
|
80
82
|
expect(result.nature).to eq(:audio)
|
81
83
|
expect(result.format).to eq(:m4a)
|
84
|
+
expect(result.content_type).to eq('audio/mp4')
|
82
85
|
end
|
83
86
|
|
84
87
|
it 'parses a MOV file and provides the necessary metadata' do
|
data/spec/parsers/mp3_parser_spec.rb
CHANGED
@@ -23,6 +23,7 @@ describe FormatParser::MP3Parser do
|
|
23
23
|
|
24
24
|
expect(parsed.nature).to eq(:audio)
|
25
25
|
expect(parsed.format).to eq(:mp3)
|
26
|
+
expect(parsed.content_type).to eq('audio/mpeg')
|
26
27
|
expect(parsed.num_audio_channels).to eq(2)
|
27
28
|
expect(parsed.audio_sample_rate_hz).to eq(48000)
|
28
29
|
expect(parsed.intrinsics).not_to be_nil
|
data/spec/parsers/ogg_parser_spec.rb
CHANGED
@@ -6,6 +6,7 @@ describe FormatParser::OggParser do
|
|
6
6
|
|
7
7
|
expect(parse_result.nature).to eq(:audio)
|
8
8
|
expect(parse_result.format).to eq(:ogg)
|
9
|
+
expect(parse_result.content_type).to eq('audio/ogg')
|
9
10
|
expect(parse_result.num_audio_channels).to eq(1)
|
10
11
|
expect(parse_result.audio_sample_rate_hz).to eq(16000)
|
11
12
|
expect(parse_result.media_duration_seconds).to be_within(0.01).of(2973.95)
|
data/spec/parsers/tiff_parser_spec.rb
CHANGED
@@ -59,6 +59,7 @@ describe FormatParser::TIFFParser do
|
|
59
59
|
expect(parsed.width_px).to eq(7952)
|
60
60
|
expect(parsed.height_px).to eq(5304)
|
61
61
|
expect(parsed.intrinsics[:exif]).not_to be_nil
|
62
|
+
expect(parsed.content_type).to eq('image/x-sony-arw')
|
62
63
|
end
|
63
64
|
|
64
65
|
describe 'correctly extracts dimensions from various TIFF flavors of the same file' do
|
data/spec/parsers/zip_parser_spec.rb
CHANGED
@@ -14,6 +14,7 @@ describe FormatParser::ZIPParser do
|
|
14
14
|
expect(result).not_to be_nil
|
15
15
|
|
16
16
|
expect(result.format).to eq(:zip)
|
17
|
+
expect(result.content_type).to eq('application/zip')
|
17
18
|
expect(result.nature).to eq(:archive)
|
18
19
|
expect(result.entries.length).to eq(0xFFFF + 1)
|
19
20
|
|
@@ -58,6 +59,7 @@ describe FormatParser::ZIPParser do
|
|
58
59
|
result = subject.call(fi_io)
|
59
60
|
expect(result.nature).to eq(:document)
|
60
61
|
expect(result.format).to eq(:docx)
|
62
|
+
expect(result.content_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
61
63
|
|
62
64
|
fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
|
63
65
|
fi_io = File.open(fixture_path, 'rb')
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.26.0
|
4
|
+
version: 0.27.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
8
8
|
- Julik Tarkhanov
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-01-
|
12
|
+
date: 2021-01-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|
@@ -280,7 +280,7 @@ licenses:
|
|
280
280
|
- MIT (Hippocratic)
|
281
281
|
metadata:
|
282
282
|
allowed_push_host: https://rubygems.org
|
283
|
-
post_install_message:
|
283
|
+
post_install_message:
|
284
284
|
rdoc_options: []
|
285
285
|
require_paths:
|
286
286
|
- lib
|
@@ -296,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
296
296
|
version: '0'
|
297
297
|
requirements: []
|
298
298
|
rubygems_version: 3.0.3
|
299
|
-
signing_key:
|
299
|
+
signing_key:
|
300
300
|
specification_version: 4
|
301
301
|
summary: A library for efficient parsing of file metadata
|
302
302
|
test_files: []
|