format_parser 0.12.4 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 03325dd0dc412571fd66d0c94226b9b88d7ae88b9bb6a985fe7c29f226c87b64
4
- data.tar.gz: cbb1e33a3bffa36e832a064d9f28c6f8ebb2985fee397560d3ce19d971c36b96
3
+ metadata.gz: f2bd20132739f75daaae94963ac56837e4852af4bd891d0b74abfef85e110eae
4
+ data.tar.gz: 442e4d4a2cdbe5c5fa1fd46b12e218f33c1e52d4706d94d751800a12913aa6ae
5
5
  SHA512:
6
- metadata.gz: 95c9d34a4469670ee69b718dac997fc3d1cc471071979ceb9df0010bff191d8b4c39e02f34669a13d831e40d918ad79889a6ff3b3ff0272df40b6160ee63bf8c
7
- data.tar.gz: 417dbdcd4ea63939e060df8a8dd6e4927b4d9ec2de01cfab7d7b9ecf9701f56c5a93494cae7fb30361178d2b05b840f6c55928fffed73742829374acc8e24aff
6
+ metadata.gz: b51a7884bb770ec2bc4d49c7583d9215acf162258368ab557d2da2475083d9ba85e7c00c34506f8cc01c812051189b3cc3e30aaea8c919bc0b7166f255947b96
7
+ data.tar.gz: ba4a3377d265d6f9d1a5887bd6de7cdc83a92c39dbfcb04c97182be265514ffc3b2d36bad1938dc0385d264602fbd197c5e8c7321cc7ad097ddb2303a612d022
@@ -1,3 +1,7 @@
1
+ ## 0.13.0
2
+ * Replace the homegrown ID3 parser with [id3tag](https://github.com/krists/id3tag) - this introduces id3tag
3
+ as a dependency in addition to `exifr`, but the gains are substantial.
4
+
1
5
  ## 0.12.4
2
6
  * Ensure JPEG recognition only runs when the JPEG SOI marker is detected **at the start** of file. Previously
3
7
  the JPEG parser would scan for the marker, sometimes finding it (appropriately) in places like... MP3 album
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
 
33
33
  spec.add_dependency 'ks', '~> 0.0.1'
34
34
  spec.add_dependency 'exifr', '~> 1.0'
35
+ spec.add_dependency 'id3tag', '~> 0.10'
35
36
  spec.add_dependency 'faraday', '~> 0.13'
36
37
 
37
38
  spec.add_development_dependency 'rspec', '~> 3.0'
@@ -11,7 +11,6 @@
11
11
  # the_foo.number_of_bars = 42
12
12
  # the_foo.as_json #=> {:number_of_bars => 42}
13
13
  module FormatParser::AttributesJSON
14
- UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
15
14
  MAXIMUM_JSON_NESTING_WHEN_SANITIZING = 256
16
15
 
17
16
  # Implements a sane default `as_json` for an object
@@ -49,7 +48,7 @@ module FormatParser::AttributesJSON
49
48
  when Float::INFINITY
50
49
  nil
51
50
  when String
52
- value.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
51
+ FormatParser.string_to_lossy_utf8(value)
53
52
  when Hash
54
53
  Hash[value.map { |k, v| [_sanitize_json_value(k, nesting + 1), _sanitize_json_value(v, nesting + 1)] }]
55
54
  when Array
@@ -243,6 +243,11 @@ module FormatParser
243
243
  end
244
244
  end
245
245
 
246
+ def self.string_to_lossy_utf8(str)
247
+ replacement_char = [0xFFFD].pack('U')
248
+ str.encode(Encoding::UTF_8, undef: :replace, replace: replacement_char)
249
+ end
250
+
246
251
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
247
252
  require parser_file
248
253
  end
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.12.4'
2
+ VERSION = '0.13.0'
3
3
  end
@@ -1,8 +1,8 @@
1
1
  require 'ks'
2
+ require 'id3tag'
2
3
 
3
4
  class FormatParser::MP3Parser
4
- require_relative 'mp3_parser/id3_v1'
5
- require_relative 'mp3_parser/id3_v2'
5
+ require_relative 'mp3_parser/id3_extraction'
6
6
 
7
7
  class MPEGFrame < Ks.strict(:offset_in_file, :mpeg_id, :channels, :sample_rate, :frame_length, :frame_bitrate)
8
8
  end
@@ -26,21 +26,50 @@ class FormatParser::MP3Parser
26
26
  # For some edge cases
27
27
  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
28
28
 
29
- def call(io)
29
+ # Wraps the Tag object returned by ID3Tag in such
30
+ # a way that a usable JSON representation gets
31
+ # returned
32
+ class TagWrapper < SimpleDelegator
33
+ include FormatParser::AttributesJSON
34
+
35
+ MEMBERS = [:artist, :title, :album, :year, :track_nr, :genre, :comments, :unsychronized_transcription]
36
+
37
+ def self.new(wrapped)
38
+ wrapped ? super : nil
39
+ end
40
+
41
+ def to_h
42
+ tag = __getobj__
43
+ MEMBERS.each_with_object({}) do |k, h|
44
+ # ID3Tag sometimes raises when trying to find an unknown genre.
45
+ # If this guard is removed, it fails when trying to do a gsub on a nil,
46
+ # in /lib/id3tag/frames/v2/genre_frame/genre_parser_pre_24.rb:25:in `just_genres'
47
+ value = tag.public_send(k) rescue nil
48
+ h[k] = value if value
49
+ end
50
+ end
51
+
52
+ def as_json(*)
53
+ to_h
54
+ end
55
+ end
56
+
57
+ def call(raw_io)
58
+ io = FormatParser::IOConstraint.new(raw_io)
59
+
30
60
  # Special case: some ZIPs (Office documents) did detect as MP3s.
31
61
  # To avoid having that happen, we check for the PKZIP signature -
32
62
  # local entry header signature - at the very start of the file
33
63
  return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
34
64
  io.seek(0)
35
65
 
36
- # Read the last 128 bytes which might contain ID3v1
37
- id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
38
- # Read the header bytes that might contain ID3v2
39
- id3_v2 = ID3V2.attempt_id3_v2_extraction(io)
66
+ # Read all the ID3 tags (or at least attempt to)
67
+ id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
68
+ tags = [id3v1, ID3Extraction.attempt_id3_v2_extraction(io)].compact
40
69
 
41
70
  # Compute how many bytes are occupied by the actual MPEG frames
42
- ignore_bytes_at_tail = id3_v1 ? 128 : 0
43
- ignore_bytes_at_head = id3_v2 ? io.pos : 0
71
+ ignore_bytes_at_tail = id3v1 ? 128 : 0
72
+ ignore_bytes_at_head = io.pos
44
73
  bytes_used_by_frames = io.size - ignore_bytes_at_tail - ignore_bytes_at_tail
45
74
 
46
75
  io.seek(ignore_bytes_at_head)
@@ -53,17 +82,12 @@ class FormatParser::MP3Parser
53
82
 
54
83
  file_info = FormatParser::Audio.new(
55
84
  format: :mp3,
56
- num_audio_channels: first_frame.channels,
57
- audio_sample_rate_hz: first_frame.sample_rate,
58
85
  # media_duration_frames is omitted because the frames
59
86
  # in MPEG are not the same thing as in a movie file - they
60
87
  # do not tell anything of substance
61
- intrinsics: {
62
- id3_v1: id3_v1 ? id3_v1.to_h : nil,
63
- id3_v2: id3_v2 ? id3_v2.map(&:to_h) : nil,
64
- xing_header: maybe_xing_header.to_h,
65
- initial_frames: initial_frames.map(&:to_h)
66
- }
88
+ num_audio_channels: first_frame.channels,
89
+ audio_sample_rate_hz: first_frame.sample_rate,
90
+ intrinsics: blend_id3_tags_into_hash(*tags).merge(id3tags: tags)
67
91
  )
68
92
 
69
93
  if maybe_xing_header
@@ -244,5 +268,11 @@ class FormatParser::MP3Parser
244
268
  raise InvalidDeepFetch, "Could not retrieve #{keys.inspect} from #{from.inspect}"
245
269
  end
246
270
 
271
+ def blend_id3_tags_into_hash(*tags)
272
+ tags.each_with_object({}) do |tag, h|
273
+ h.merge!(TagWrapper.new(tag).to_h)
274
+ end
275
+ end
276
+
247
277
  FormatParser.register_parser self, natures: :audio, formats: :mp3
248
278
  end
@@ -0,0 +1,76 @@
1
+ module FormatParser::MP3Parser::ID3Extraction
2
+ ID3V1_TAG_SIZE_BYTES = 128
3
+ ID3V2_TAG_VERSIONS = ["\x43\x00".b, "\x03\x00".b, "\x02\x00".b]
4
+ MAX_SIZE_FOR_ID3V2 = 1 * 1024 * 1024
5
+
6
+ extend FormatParser::IOUtils
7
+
8
+ def attempt_id3_v1_extraction(io)
9
+ return if io.size < ID3V1_TAG_SIZE_BYTES # Won't fit the ID3v1 regardless
10
+
11
+ io.seek(io.size - 128)
12
+ trailer_bytes = io.read(128)
13
+
14
+ return unless trailer_bytes && trailer_bytes.bytesize == ID3V1_TAG_SIZE_BYTES
15
+ return unless trailer_bytes.byteslice(0, 3) == 'TAG'
16
+
17
+ buf = StringIO.new(trailer_bytes)
18
+ swallow_exceptions { ID3Tag.read(buf, :v1) }
19
+ end
20
+
21
+ def attempt_id3_v2_extraction(io)
22
+ io.seek(0) # Only support header ID3v2
23
+ header = parse_id3_v2_header(io)
24
+ return unless header[:tag] == 'ID3' && header[:size] > 0
25
+ return unless ID3V2_TAG_VERSIONS.include?(header[:version])
26
+
27
+ id3_tag_size = io.pos + header[:size]
28
+
29
+ # Here we have to pay attention. The tag size encoded in
30
+ # the ID3 header is a 4-byte unsigned int. Meaning it
31
+ # can hold values up to 256 MB. We do not want to read
32
+ # that much since we are pulling that data into memory -
33
+ # and it would also make the parser easily exploitable.
34
+ # We will set a "hard" limit beyond which we will simply
35
+ # refuse to read those tags at all.
36
+ if id3_tag_size > MAX_SIZE_FOR_ID3V2
37
+ io.seek(id3_tag_size) # For reading the frames
38
+ return
39
+ end
40
+
41
+ io.seek(0)
42
+ blob = safe_read(io, id3_tag_size)
43
+
44
+ swallow_exceptions { ID3Tag.read(StringIO.new(blob), :v2) }
45
+ rescue FormatParser::IOUtils::InvalidRead
46
+ nil
47
+ end
48
+
49
+ def read_and_unpack_packspec(io, **packspec)
50
+ sizes = {'a' => 1, 'N' => 4}
51
+ n = packspec.values.map { |e| sizes.fetch(e[0]) * e[1].to_i }.inject(&:+)
52
+ byte_str = safe_read(io, n)
53
+
54
+ unpacked_values = byte_str.unpack(packspec.values.join)
55
+ Hash[packspec.keys.zip(unpacked_values)]
56
+ end
57
+
58
+ def parse_id3_v2_header(io)
59
+ fields = {tag: :a3, version: :a2, flags: :a1, syncsafe_size: :N1}
60
+ header_data = read_and_unpack_packspec(io, **fields)
61
+ header_data[:size] = ID3Tag::SynchsafeInteger.decode(header_data.delete(:syncsafe_size))
62
+ header_data
63
+ end
64
+
65
+ # We swallow exceptions from ID3Tag primarily because it does not have
66
+ # a single wrapping error class we could capture. We also do not touch our original
67
+ # IO object when working with ID3Tag
68
+ def swallow_exceptions
69
+ yield
70
+ rescue => e
71
+ warn(e)
72
+ nil
73
+ end
74
+
75
+ extend self
76
+ end
@@ -1,5 +1,4 @@
1
1
  class FormatParser::ZIPParser
2
- UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
3
2
  require_relative 'zip_parser/file_reader'
4
3
  require_relative 'zip_parser/office_formats'
5
4
 
@@ -40,7 +39,7 @@ class FormatParser::ZIPParser
40
39
 
41
40
  def decode_filename(filename, likely_unicode:)
42
41
  filename.force_encoding(Encoding::UTF_8) if likely_unicode
43
- filename.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
42
+ FormatParser.string_to_lossy_utf8(filename)
44
43
  end
45
44
 
46
45
  def decode_filename_of(zip_entry)
@@ -29,6 +29,24 @@ describe FormatParser::MP3Parser do
29
29
  expect(parsed.media_duration_seconds).to be_within(0.1).of(0.81)
30
30
  end
31
31
 
32
+ it 'does not attempt to read ID3V2 tags that are too large' do
33
+ more_bytes_than_permitted = 3 * 1024 * 1024
34
+ gunk = Random.new.bytes(more_bytes_than_permitted)
35
+
36
+ large_syncsfe_size = [ID3Tag::SynchsafeInteger.encode(more_bytes_than_permitted)].pack('N')
37
+ prepped = StringIO.new(
38
+ 'ID3' + "\x43\x00".b + "\x00".b + large_syncsfe_size + gunk
39
+ )
40
+
41
+ expect(ID3Tag).not_to receive(:read)
42
+
43
+ prepped.seek(0)
44
+ result = FormatParser::MP3Parser::ID3Extraction.attempt_id3_v2_extraction(prepped)
45
+
46
+ expect(result).to be_nil
47
+ expect(prepped.pos).to eq(3145738)
48
+ end
49
+
32
50
  it 'parses the Cassy MP3' do
33
51
  fpath = fixtures_dir + '/MP3/Cassy.mp3'
34
52
  parsed = subject.call(File.open(fpath, 'rb'))
@@ -39,10 +57,21 @@ describe FormatParser::MP3Parser do
39
57
  expect(parsed.format).to eq(:mp3)
40
58
  expect(parsed.num_audio_channels).to eq(2)
41
59
  expect(parsed.audio_sample_rate_hz).to eq(44100)
42
- expect(parsed.intrinsics).not_to be_nil
43
60
  expect(parsed.media_duration_seconds).to be_within(0.1).of(1102.46)
44
61
 
45
62
  expect(parsed.intrinsics).not_to be_nil
63
+
64
+ i = parsed.intrinsics
65
+ expect(i[:artist]).to eq('WeTransfer Studios/GIlles Peterson')
66
+ expect(i[:title]).to eq('Cassy')
67
+ expect(i[:album]).to eq('The Psychology of DJing')
68
+ expect(i[:comments]).to eq('0')
69
+ expect(i[:id3tags]).not_to be_nil
70
+
71
+ expect(parsed.intrinsics).not_to be_nil
72
+
73
+ # Make sure we are good with our JSON representation as well
74
+ JSON.pretty_generate(parsed)
46
75
  end
47
76
 
48
77
  it 'avoids returning a result when the parsed duration is infinite' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.4
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -39,6 +39,20 @@ dependencies:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
41
  version: '1.0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: id3tag
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.10'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.10'
42
56
  - !ruby/object:Gem::Dependency
43
57
  name: faraday
44
58
  requirement: !ruby/object:Gem::Requirement
@@ -186,8 +200,7 @@ files:
186
200
  - lib/parsers/moov_parser.rb
187
201
  - lib/parsers/moov_parser/decoder.rb
188
202
  - lib/parsers/mp3_parser.rb
189
- - lib/parsers/mp3_parser/id3_v1.rb
190
- - lib/parsers/mp3_parser/id3_v2.rb
203
+ - lib/parsers/mp3_parser/id3_extraction.rb
191
204
  - lib/parsers/pdf_parser.rb
192
205
  - lib/parsers/png_parser.rb
193
206
  - lib/parsers/psd_parser.rb
@@ -1,48 +0,0 @@
1
- module FormatParser::MP3Parser::ID3V1
2
- PACKSPEC = [
3
- :tag, :a3,
4
- :song_name, :a30,
5
- :artist, :a30,
6
- :album, :a30,
7
- :year, :N1,
8
- :comment, :a30,
9
- :genre, :C,
10
- ]
11
- packspec_keys = PACKSPEC.select.with_index { |_, i| i.even? }
12
- TAG_SIZE_BYTES = 128
13
-
14
- class TagInformation < Struct.new(*packspec_keys)
15
- end
16
-
17
- def attempt_id3_v1_extraction(io)
18
- return if io.size < TAG_SIZE_BYTES # Won't fit the ID3v1 regardless
19
-
20
- io.seek(io.size - 128)
21
- trailer_bytes = io.read(128)
22
-
23
- return unless trailer_bytes && trailer_bytes.byteslice(0, 3) == 'TAG'
24
-
25
- id3_v1 = parse_id3_v1(trailer_bytes)
26
-
27
- # If all of the resulting strings are empty this ID3v1 tag is invalid and
28
- # we should ignore it.
29
- strings_from_id3v1 = id3_v1.values.select { |e| e.is_a?(String) && e != 'TAG' }
30
- return if strings_from_id3v1.all?(&:empty?)
31
-
32
- id3_v1
33
- end
34
-
35
- def parse_id3_v1(byte_str)
36
- _keys, values = PACKSPEC.partition.with_index { |_, i| i.even? }
37
- unpacked_values = byte_str.unpack(values.join)
38
- unpacked_values.map! { |e| e.is_a?(String) ? trim_id3v1_string(e) : e }
39
- TagInformation.new(unpacked_values)
40
- end
41
-
42
- # Remove trailing whitespace and trailing nullbytes
43
- def trim_id3v1_string(str)
44
- str.tr("\x00".b, '').strip
45
- end
46
-
47
- extend self
48
- end
@@ -1,84 +0,0 @@
1
- module FormatParser::MP3Parser::ID3V2
2
- def attempt_id3_v2_extraction(io)
3
- io.seek(0) # Only support header ID3v2
4
- header_bytes = io.read(10)
5
- return unless header_bytes
6
-
7
- header = parse_id3_v2_header(header_bytes)
8
- return unless header[:tag] == 'ID3'
9
- return unless header[:size] > 0
10
-
11
- header_tag_payload = io.read(header[:size])
12
- header_tag_payload = StringIO.new(header_tag_payload)
13
-
14
- return unless header_tag_payload.size == header[:size]
15
-
16
- frames = []
17
- loop do
18
- break if header_tag_payload.eof?
19
- frame = parse_id3_v2_frame(header_tag_payload)
20
- # Some files include padding, which is there so that when you edit ID3v2
21
- # you do not have to overwrite the entire file - you can use this padding to
22
- # add some more tags or to grow the existing ones. In practice if we hit
23
- # something with a type of "0x00000000" we have entered the padding zone and
24
- # there is no point in parsing further
25
- if frame[:id] == "\x00\x00\x00\x00".b
26
- break
27
- else
28
- frames << frame
29
- end
30
- end
31
- frames
32
- end
33
-
34
- def parse_id3_v2_header(byte_str)
35
- packspec = [
36
- :tag, :a3,
37
- :version, :a2,
38
- :flags, :C1,
39
- :size, :a4,
40
- ]
41
- keys, values = packspec.partition.with_index { |_, i| i.even? }
42
- unpacked_values = byte_str.unpack(values.join)
43
- header_data = Hash[keys.zip(unpacked_values)]
44
-
45
- header_data[:version] = header_data[:version].unpack('C2')
46
- header_data[:size] = decode_syncsafe_int(header_data[:size])
47
-
48
- header_data
49
- end
50
-
51
- def parse_id3_v2_frame(io)
52
- id, syncsafe_size, flags = io.read(10).unpack('a4a4a2')
53
- size = decode_syncsafe_int(syncsafe_size)
54
- content = io.read(size)
55
- # It might so happen in situations of terrible invalidity that we end up
56
- # with less data than advertised by the syncsafe size. We will just truck on.
57
- {id: id, size: size, flags: flags, content: content}
58
- end
59
-
60
- # ID3v2 uses "synchsafe integers", which are unsigned integers smeared
61
- # over multiple bytes in such a manner that the first bit is always 0 (unset).
62
- # This is done so that ID3v2 incompatible decoders will not by accident see
63
- # the 0xFF0xFF0xFF0xFF sequence anywhere that can be mistaken for the MPEG frame
64
- # synchronisation header. Effectively it is a 7 bit big-endian unsigned integer
65
- # encoding.
66
- #
67
- # 8 bit 255 (0xFF) encoded in this manner takes 16 bits instead,
68
- # and looks like this: `0b00000001 01111111`. Note how it avoids having
69
- # the first bit of the second byte be 1.
70
- # This method decodes an unsigned integer packed in this fashion
71
- def decode_syncsafe_int(bytes)
72
- size = 0
73
- j = 0
74
- i = bytes.bytesize - 1
75
- while i >= 0
76
- size += 128**i * (bytes.getbyte(j) & 0x7f)
77
- j += 1
78
- i -= 1
79
- end
80
- size
81
- end
82
-
83
- extend self
84
- end