format_parser 0.12.4 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 03325dd0dc412571fd66d0c94226b9b88d7ae88b9bb6a985fe7c29f226c87b64
4
- data.tar.gz: cbb1e33a3bffa36e832a064d9f28c6f8ebb2985fee397560d3ce19d971c36b96
3
+ metadata.gz: f2bd20132739f75daaae94963ac56837e4852af4bd891d0b74abfef85e110eae
4
+ data.tar.gz: 442e4d4a2cdbe5c5fa1fd46b12e218f33c1e52d4706d94d751800a12913aa6ae
5
5
  SHA512:
6
- metadata.gz: 95c9d34a4469670ee69b718dac997fc3d1cc471071979ceb9df0010bff191d8b4c39e02f34669a13d831e40d918ad79889a6ff3b3ff0272df40b6160ee63bf8c
7
- data.tar.gz: 417dbdcd4ea63939e060df8a8dd6e4927b4d9ec2de01cfab7d7b9ecf9701f56c5a93494cae7fb30361178d2b05b840f6c55928fffed73742829374acc8e24aff
6
+ metadata.gz: b51a7884bb770ec2bc4d49c7583d9215acf162258368ab557d2da2475083d9ba85e7c00c34506f8cc01c812051189b3cc3e30aaea8c919bc0b7166f255947b96
7
+ data.tar.gz: ba4a3377d265d6f9d1a5887bd6de7cdc83a92c39dbfcb04c97182be265514ffc3b2d36bad1938dc0385d264602fbd197c5e8c7321cc7ad097ddb2303a612d022
@@ -1,3 +1,7 @@
1
+ ## 0.13.0
2
+ * Replace the homegrown ID3 parser with [id3tag](https://github.com/krists/id3tag) - this introduces id3tag
3
+ as a dependency in addition to `exifr`, but the gains are substantial.
4
+
1
5
  ## 0.12.4
2
6
  * Ensure JPEG recognition only runs when the JPEG SOI marker is detected **at the start** of file. Previously
3
7
  the JPEG parser would scan for the marker, sometimes finding it (appropriately) in places like... MP3 album
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
 
33
33
  spec.add_dependency 'ks', '~> 0.0.1'
34
34
  spec.add_dependency 'exifr', '~> 1.0'
35
+ spec.add_dependency 'id3tag', '~> 0.10'
35
36
  spec.add_dependency 'faraday', '~> 0.13'
36
37
 
37
38
  spec.add_development_dependency 'rspec', '~> 3.0'
@@ -11,7 +11,6 @@
11
11
  # the_foo.number_of_bars = 42
12
12
  # the_foo.as_json #=> {:number_of_bars => 42}
13
13
  module FormatParser::AttributesJSON
14
- UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
15
14
  MAXIMUM_JSON_NESTING_WHEN_SANITIZING = 256
16
15
 
17
16
  # Implements a sane default `as_json` for an object
@@ -49,7 +48,7 @@ module FormatParser::AttributesJSON
49
48
  when Float::INFINITY
50
49
  nil
51
50
  when String
52
- value.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
51
+ FormatParser.string_to_lossy_utf8(value)
53
52
  when Hash
54
53
  Hash[value.map { |k, v| [_sanitize_json_value(k, nesting + 1), _sanitize_json_value(v, nesting + 1)] }]
55
54
  when Array
@@ -243,6 +243,11 @@ module FormatParser
243
243
  end
244
244
  end
245
245
 
246
+ def self.string_to_lossy_utf8(str)
247
+ replacement_char = [0xFFFD].pack('U')
248
+ str.encode(Encoding::UTF_8, undef: :replace, replace: replacement_char)
249
+ end
250
+
246
251
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
247
252
  require parser_file
248
253
  end
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.12.4'
2
+ VERSION = '0.13.0'
3
3
  end
@@ -1,8 +1,8 @@
1
1
  require 'ks'
2
+ require 'id3tag'
2
3
 
3
4
  class FormatParser::MP3Parser
4
- require_relative 'mp3_parser/id3_v1'
5
- require_relative 'mp3_parser/id3_v2'
5
+ require_relative 'mp3_parser/id3_extraction'
6
6
 
7
7
  class MPEGFrame < Ks.strict(:offset_in_file, :mpeg_id, :channels, :sample_rate, :frame_length, :frame_bitrate)
8
8
  end
@@ -26,21 +26,50 @@ class FormatParser::MP3Parser
26
26
  # For some edge cases
27
27
  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
28
28
 
29
- def call(io)
29
+ # Wraps the Tag object returned by ID3Tag in such
30
+ # a way that a usable JSON representation gets
31
+ # returned
32
+ class TagWrapper < SimpleDelegator
33
+ include FormatParser::AttributesJSON
34
+
35
+ MEMBERS = [:artist, :title, :album, :year, :track_nr, :genre, :comments, :unsychronized_transcription]
36
+
37
+ def self.new(wrapped)
38
+ wrapped ? super : nil
39
+ end
40
+
41
+ def to_h
42
+ tag = __getobj__
43
+ MEMBERS.each_with_object({}) do |k, h|
44
+ # ID3Tag sometimes raises when trying to find an unknown genre.
45
+ # If this guard is removed, it fails when trying to do a gsub on a nil,
46
+ # in /lib/id3tag/frames/v2/genre_frame/genre_parser_pre_24.rb:25:in `just_genres'
47
+ value = tag.public_send(k) rescue nil
48
+ h[k] = value if value
49
+ end
50
+ end
51
+
52
+ def as_json(*)
53
+ to_h
54
+ end
55
+ end
56
+
57
+ def call(raw_io)
58
+ io = FormatParser::IOConstraint.new(raw_io)
59
+
30
60
  # Special case: some ZIPs (Office documents) did detect as MP3s.
31
61
  # To avoid having that happen, we check for the PKZIP signature -
32
62
  # local entry header signature - at the very start of the file
33
63
  return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
34
64
  io.seek(0)
35
65
 
36
- # Read the last 128 bytes which might contain ID3v1
37
- id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
38
- # Read the header bytes that might contain ID3v2
39
- id3_v2 = ID3V2.attempt_id3_v2_extraction(io)
66
+ # Read all the ID3 tags (or at least attempt to)
67
+ id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
68
+ tags = [id3v1, ID3Extraction.attempt_id3_v2_extraction(io)].compact
40
69
 
41
70
  # Compute how many bytes are occupied by the actual MPEG frames
42
- ignore_bytes_at_tail = id3_v1 ? 128 : 0
43
- ignore_bytes_at_head = id3_v2 ? io.pos : 0
71
+ ignore_bytes_at_tail = id3v1 ? 128 : 0
72
+ ignore_bytes_at_head = io.pos
44
73
  bytes_used_by_frames = io.size - ignore_bytes_at_tail - ignore_bytes_at_tail
45
74
 
46
75
  io.seek(ignore_bytes_at_head)
@@ -53,17 +82,12 @@ class FormatParser::MP3Parser
53
82
 
54
83
  file_info = FormatParser::Audio.new(
55
84
  format: :mp3,
56
- num_audio_channels: first_frame.channels,
57
- audio_sample_rate_hz: first_frame.sample_rate,
58
85
  # media_duration_frames is omitted because the frames
59
86
  # in MPEG are not the same thing as in a movie file - they
60
87
  # do not tell anything of substance
61
- intrinsics: {
62
- id3_v1: id3_v1 ? id3_v1.to_h : nil,
63
- id3_v2: id3_v2 ? id3_v2.map(&:to_h) : nil,
64
- xing_header: maybe_xing_header.to_h,
65
- initial_frames: initial_frames.map(&:to_h)
66
- }
88
+ num_audio_channels: first_frame.channels,
89
+ audio_sample_rate_hz: first_frame.sample_rate,
90
+ intrinsics: blend_id3_tags_into_hash(*tags).merge(id3tags: tags)
67
91
  )
68
92
 
69
93
  if maybe_xing_header
@@ -244,5 +268,11 @@ class FormatParser::MP3Parser
244
268
  raise InvalidDeepFetch, "Could not retrieve #{keys.inspect} from #{from.inspect}"
245
269
  end
246
270
 
271
+ def blend_id3_tags_into_hash(*tags)
272
+ tags.each_with_object({}) do |tag, h|
273
+ h.merge!(TagWrapper.new(tag).to_h)
274
+ end
275
+ end
276
+
247
277
  FormatParser.register_parser self, natures: :audio, formats: :mp3
248
278
  end
@@ -0,0 +1,76 @@
1
+ module FormatParser::MP3Parser::ID3Extraction
2
+ ID3V1_TAG_SIZE_BYTES = 128
3
+ ID3V2_TAG_VERSIONS = ["\x43\x00".b, "\x03\x00".b, "\x02\x00".b]
4
+ MAX_SIZE_FOR_ID3V2 = 1 * 1024 * 1024
5
+
6
+ extend FormatParser::IOUtils
7
+
8
+ def attempt_id3_v1_extraction(io)
9
+ return if io.size < ID3V1_TAG_SIZE_BYTES # Won't fit the ID3v1 regardless
10
+
11
+ io.seek(io.size - 128)
12
+ trailer_bytes = io.read(128)
13
+
14
+ return unless trailer_bytes && trailer_bytes.bytesize == ID3V1_TAG_SIZE_BYTES
15
+ return unless trailer_bytes.byteslice(0, 3) == 'TAG'
16
+
17
+ buf = StringIO.new(trailer_bytes)
18
+ swallow_exceptions { ID3Tag.read(buf, :v1) }
19
+ end
20
+
21
+ def attempt_id3_v2_extraction(io)
22
+ io.seek(0) # Only support header ID3v2
23
+ header = parse_id3_v2_header(io)
24
+ return unless header[:tag] == 'ID3' && header[:size] > 0
25
+ return unless ID3V2_TAG_VERSIONS.include?(header[:version])
26
+
27
+ id3_tag_size = io.pos + header[:size]
28
+
29
+ # Here we have to pay attention. The tag size encoded in
30
+ # the ID3 header is a 4-byte unsigned int. Meaning it
31
+ # can hold values up to 256 MB. We do not want to read
32
+ # that much since we are pulling that data into memory -
33
+ # and it would also make the parser easily exploitable.
34
+ # We will set a "hard" limit beyond which we will simply
35
+ # refuse to read those tags at all.
36
+ if id3_tag_size > MAX_SIZE_FOR_ID3V2
37
+ io.seek(id3_tag_size) # For reading the frames
38
+ return
39
+ end
40
+
41
+ io.seek(0)
42
+ blob = safe_read(io, id3_tag_size)
43
+
44
+ swallow_exceptions { ID3Tag.read(StringIO.new(blob), :v2) }
45
+ rescue FormatParser::IOUtils::InvalidRead
46
+ nil
47
+ end
48
+
49
+ def read_and_unpack_packspec(io, **packspec)
50
+ sizes = {'a' => 1, 'N' => 4}
51
+ n = packspec.values.map { |e| sizes.fetch(e[0]) * e[1].to_i }.inject(&:+)
52
+ byte_str = safe_read(io, n)
53
+
54
+ unpacked_values = byte_str.unpack(packspec.values.join)
55
+ Hash[packspec.keys.zip(unpacked_values)]
56
+ end
57
+
58
+ def parse_id3_v2_header(io)
59
+ fields = {tag: :a3, version: :a2, flags: :a1, syncsafe_size: :N1}
60
+ header_data = read_and_unpack_packspec(io, **fields)
61
+ header_data[:size] = ID3Tag::SynchsafeInteger.decode(header_data.delete(:syncsafe_size))
62
+ header_data
63
+ end
64
+
65
+ # We swallow exceptions from ID3Tag primarily because it does not have
66
+ # a single wrapping error class we could capture. We also do not touch our original
67
+ # IO object when working with ID3Tag
68
+ def swallow_exceptions
69
+ yield
70
+ rescue => e
71
+ warn(e)
72
+ nil
73
+ end
74
+
75
+ extend self
76
+ end
@@ -1,5 +1,4 @@
1
1
  class FormatParser::ZIPParser
2
- UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
3
2
  require_relative 'zip_parser/file_reader'
4
3
  require_relative 'zip_parser/office_formats'
5
4
 
@@ -40,7 +39,7 @@ class FormatParser::ZIPParser
40
39
 
41
40
  def decode_filename(filename, likely_unicode:)
42
41
  filename.force_encoding(Encoding::UTF_8) if likely_unicode
43
- filename.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
42
+ FormatParser.string_to_lossy_utf8(filename)
44
43
  end
45
44
 
46
45
  def decode_filename_of(zip_entry)
@@ -29,6 +29,24 @@ describe FormatParser::MP3Parser do
29
29
  expect(parsed.media_duration_seconds).to be_within(0.1).of(0.81)
30
30
  end
31
31
 
32
+ it 'does not attempt to read ID3V2 tags that are too large' do
33
+ more_bytes_than_permitted = 3 * 1024 * 1024
34
+ gunk = Random.new.bytes(more_bytes_than_permitted)
35
+
36
+ large_syncsfe_size = [ID3Tag::SynchsafeInteger.encode(more_bytes_than_permitted)].pack('N')
37
+ prepped = StringIO.new(
38
+ 'ID3' + "\x43\x00".b + "\x00".b + large_syncsfe_size + gunk
39
+ )
40
+
41
+ expect(ID3Tag).not_to receive(:read)
42
+
43
+ prepped.seek(0)
44
+ result = FormatParser::MP3Parser::ID3Extraction.attempt_id3_v2_extraction(prepped)
45
+
46
+ expect(result).to be_nil
47
+ expect(prepped.pos).to eq(3145738)
48
+ end
49
+
32
50
  it 'parses the Cassy MP3' do
33
51
  fpath = fixtures_dir + '/MP3/Cassy.mp3'
34
52
  parsed = subject.call(File.open(fpath, 'rb'))
@@ -39,10 +57,21 @@ describe FormatParser::MP3Parser do
39
57
  expect(parsed.format).to eq(:mp3)
40
58
  expect(parsed.num_audio_channels).to eq(2)
41
59
  expect(parsed.audio_sample_rate_hz).to eq(44100)
42
- expect(parsed.intrinsics).not_to be_nil
43
60
  expect(parsed.media_duration_seconds).to be_within(0.1).of(1102.46)
44
61
 
45
62
  expect(parsed.intrinsics).not_to be_nil
63
+
64
+ i = parsed.intrinsics
65
+ expect(i[:artist]).to eq('WeTransfer Studios/GIlles Peterson')
66
+ expect(i[:title]).to eq('Cassy')
67
+ expect(i[:album]).to eq('The Psychology of DJing')
68
+ expect(i[:comments]).to eq('0')
69
+ expect(i[:id3tags]).not_to be_nil
70
+
71
+ expect(parsed.intrinsics).not_to be_nil
72
+
73
+ # Make sure we are good with our JSON representation as well
74
+ JSON.pretty_generate(parsed)
46
75
  end
47
76
 
48
77
  it 'avoids returning a result when the parsed duration is infinite' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.4
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -39,6 +39,20 @@ dependencies:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
41
  version: '1.0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: id3tag
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.10'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.10'
42
56
  - !ruby/object:Gem::Dependency
43
57
  name: faraday
44
58
  requirement: !ruby/object:Gem::Requirement
@@ -186,8 +200,7 @@ files:
186
200
  - lib/parsers/moov_parser.rb
187
201
  - lib/parsers/moov_parser/decoder.rb
188
202
  - lib/parsers/mp3_parser.rb
189
- - lib/parsers/mp3_parser/id3_v1.rb
190
- - lib/parsers/mp3_parser/id3_v2.rb
203
+ - lib/parsers/mp3_parser/id3_extraction.rb
191
204
  - lib/parsers/pdf_parser.rb
192
205
  - lib/parsers/png_parser.rb
193
206
  - lib/parsers/psd_parser.rb
@@ -1,48 +0,0 @@
1
- module FormatParser::MP3Parser::ID3V1
2
- PACKSPEC = [
3
- :tag, :a3,
4
- :song_name, :a30,
5
- :artist, :a30,
6
- :album, :a30,
7
- :year, :N1,
8
- :comment, :a30,
9
- :genre, :C,
10
- ]
11
- packspec_keys = PACKSPEC.select.with_index { |_, i| i.even? }
12
- TAG_SIZE_BYTES = 128
13
-
14
- class TagInformation < Struct.new(*packspec_keys)
15
- end
16
-
17
- def attempt_id3_v1_extraction(io)
18
- return if io.size < TAG_SIZE_BYTES # Won't fit the ID3v1 regardless
19
-
20
- io.seek(io.size - 128)
21
- trailer_bytes = io.read(128)
22
-
23
- return unless trailer_bytes && trailer_bytes.byteslice(0, 3) == 'TAG'
24
-
25
- id3_v1 = parse_id3_v1(trailer_bytes)
26
-
27
- # If all of the resulting strings are empty this ID3v1 tag is invalid and
28
- # we should ignore it.
29
- strings_from_id3v1 = id3_v1.values.select { |e| e.is_a?(String) && e != 'TAG' }
30
- return if strings_from_id3v1.all?(&:empty?)
31
-
32
- id3_v1
33
- end
34
-
35
- def parse_id3_v1(byte_str)
36
- _keys, values = PACKSPEC.partition.with_index { |_, i| i.even? }
37
- unpacked_values = byte_str.unpack(values.join)
38
- unpacked_values.map! { |e| e.is_a?(String) ? trim_id3v1_string(e) : e }
39
- TagInformation.new(unpacked_values)
40
- end
41
-
42
- # Remove trailing whitespace and trailing nullbytes
43
- def trim_id3v1_string(str)
44
- str.tr("\x00".b, '').strip
45
- end
46
-
47
- extend self
48
- end
@@ -1,84 +0,0 @@
1
- module FormatParser::MP3Parser::ID3V2
2
- def attempt_id3_v2_extraction(io)
3
- io.seek(0) # Only support header ID3v2
4
- header_bytes = io.read(10)
5
- return unless header_bytes
6
-
7
- header = parse_id3_v2_header(header_bytes)
8
- return unless header[:tag] == 'ID3'
9
- return unless header[:size] > 0
10
-
11
- header_tag_payload = io.read(header[:size])
12
- header_tag_payload = StringIO.new(header_tag_payload)
13
-
14
- return unless header_tag_payload.size == header[:size]
15
-
16
- frames = []
17
- loop do
18
- break if header_tag_payload.eof?
19
- frame = parse_id3_v2_frame(header_tag_payload)
20
- # Some files include padding, which is there so that when you edit ID3v2
21
- # you do not have to overwrite the entire file - you can use this padding to
22
- # add some more tags or to grow the existing ones. In practice if we hit
23
- # something with a type of "0x00000000" we have entered the padding zone and
24
- # there is no point in parsing further
25
- if frame[:id] == "\x00\x00\x00\x00".b
26
- break
27
- else
28
- frames << frame
29
- end
30
- end
31
- frames
32
- end
33
-
34
- def parse_id3_v2_header(byte_str)
35
- packspec = [
36
- :tag, :a3,
37
- :version, :a2,
38
- :flags, :C1,
39
- :size, :a4,
40
- ]
41
- keys, values = packspec.partition.with_index { |_, i| i.even? }
42
- unpacked_values = byte_str.unpack(values.join)
43
- header_data = Hash[keys.zip(unpacked_values)]
44
-
45
- header_data[:version] = header_data[:version].unpack('C2')
46
- header_data[:size] = decode_syncsafe_int(header_data[:size])
47
-
48
- header_data
49
- end
50
-
51
- def parse_id3_v2_frame(io)
52
- id, syncsafe_size, flags = io.read(10).unpack('a4a4a2')
53
- size = decode_syncsafe_int(syncsafe_size)
54
- content = io.read(size)
55
- # It might so happen in situations of terrible invalidity that we end up
56
- # with less data than advertised by the syncsafe size. We will just truck on.
57
- {id: id, size: size, flags: flags, content: content}
58
- end
59
-
60
- # ID3v2 uses "synchsafe integers", which are unsigned integers smeared
61
- # over multiple bytes in such a manner that the first bit is always 0 (unset).
62
- # This is done so that ID3v2 incompatible decoders will not by accident see
63
- # the 0xFF0xFF0xFF0xFF sequence anywhere that can be mistaken for the MPEG frame
64
- # synchronisation header. Effectively it is a 7 bit big-endian unsigned integer
65
- # encoding.
66
- #
67
- # 8 bit 255 (0xFF) encoded in this manner takes 16 bits instead,
68
- # and looks like this: `0b00000001 01111111`. Note how it avoids having
69
- # the first bit of the second byte be 1.
70
- # This method decodes an unsigned integer packed in this fashion
71
- def decode_syncsafe_int(bytes)
72
- size = 0
73
- j = 0
74
- i = bytes.bytesize - 1
75
- while i >= 0
76
- size += 128**i * (bytes.getbyte(j) & 0x7f)
77
- j += 1
78
- i -= 1
79
- end
80
- size
81
- end
82
-
83
- extend self
84
- end