format_parser 0.13.6 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 236f35fe657e5bb8f51cf08724fb3138f17b6a20605af4131a7643711f43cd93
4
- data.tar.gz: 65037da607c406be2bf0d8e7eb549537199a4ed8c97243c68b63c62e20bdb9e5
3
+ metadata.gz: 0d9daf0603ac099b75a9ddb85b8061190dc8fe2f3aad46633ff94b1f4a99020e
4
+ data.tar.gz: d4343aa08f9ec6a6864a5aed3b7ea174f779c3ea0954417e9b31a2c871126f27
5
5
  SHA512:
6
- metadata.gz: aaa8a5c25a9b9b6884e0ec22adf90390aa2a32e4268e7e1d01c98a5d88a20bb25a4635ca53fd891d28bae8de34287ff57c943af76c905425f41ade98985408d9
7
- data.tar.gz: 54a417ead7b3d12d585f6775feee2295d95f40fe9ac66a7a5e0788dfd6791271d95ddc1ef230a7c3e17c04635765916090db2a0cf98a722e5c7d662f80eb7d37
6
+ metadata.gz: 2d63a7d9802157e35260b91f5a8077008fa4e7c19837cc4a01ff0d3954fa22d29455a13cb914bc892782d986a7d85b9465eb21b8c753d16c0da774ddb0f5c47c
7
+ data.tar.gz: 651d41396efdeb4e9f74173d284617baa0bf34f3ce6e8a6890a1ceadf2ef4a3556a3dd0b4b282dc84ea9f65c020ecbe1ae942da78eed85bed6d969161dbb0cbb
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.14.0
2
+ * PDF: Reduce the PDF parser to the basic binary detection (PDF/not PDF) until we have a better/more robust PDF parser
3
+ * MP3: Fix the byte length of MPEG frames calculation to correctly account for ID3V1 and ID3V2 instead of ID3V1 twice
4
+ * MP3: Remove the workaround for `id3tag` choking on non-matching genre strings (bumps dependency on `id3tag`)
5
+ * Use Measurometer provided by the [measurometer gem](https://rubygems.org/gems/measurometer)
6
+ * Ogg: Add support for the Ogg format
7
+
1
8
  ## 0.13.6
2
9
  * Make all reads in the MOOV decoder strict - fail early if reads are improperly sized
3
10
  * Disable parsing for `udta` atoms in MP4/MOV since we do not have a good way of parsing them yet
data/README.md CHANGED
@@ -30,12 +30,13 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
30
30
  * M4A
31
31
  * ZIP
32
32
  * DOCX, PPTX, XLSX
33
+ * OGG
33
34
 
34
35
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
35
36
 
36
37
  ## Basic usage
37
38
 
38
- Pass an IO object that responds to `read` and `seek` to `FormatParser.parse` and the first confirmed match will be returned.
39
+ Pass an IO object that responds to `read`, `seek` and `size` to `FormatParser.parse` and the first confirmed match will be returned.
39
40
 
40
41
  ```ruby
41
42
  match = FormatParser.parse(File.open("myimage.jpg", "rb"))
@@ -107,6 +108,11 @@ Therefore we adapt the following approaches:
107
108
  is easier to verify and test, and we likely don't care about all the metadata anyway
108
109
  * Avoid using C libraries which are likely to contain buffer overflows/underflows - we stay memory safe
109
110
 
111
+ ## Acknowledgements
112
+
113
+ We are incredibly grateful to Remco van't Veer for [exifr](https://github.com/remvee/exifr) and to
114
+ Krists Ozols for [id3tag](https://github.com/krists/id3tag) that we are using for crucial tasks.
115
+
110
116
  ## Fixture Sources
111
117
 
112
118
  Unless specified otherwise in this section the fixture files are MIT licensed and from the FastImage and Dimensions projects.
@@ -145,6 +151,9 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
145
151
  - atc_fixture_vbr.flac is a converted version of the MP3 with the same name
146
152
  - c_11k16btipcm.flac is a converted version of the WAV with the same name
147
153
 
154
+ ### OGG
155
+ - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `with_garbage_at_the_end.ogg` have been generated by the project contributors
156
+
148
157
  ### M4A
149
158
  - fixture.m4a was created by one of the project maintainers and is MIT licensed
150
159
 
@@ -32,8 +32,9 @@ Gem::Specification.new do |spec|
32
32
 
33
33
  spec.add_dependency 'ks', '~> 0.0.1'
34
34
  spec.add_dependency 'exifr', '~> 1.0'
35
- spec.add_dependency 'id3tag', '~> 0.10'
35
+ spec.add_dependency 'id3tag', '~> 0.10', '>= 0.10.1'
36
36
  spec.add_dependency 'faraday', '~> 0.13'
37
+ spec.add_dependency 'measurometer', '~> 1'
37
38
 
38
39
  spec.add_development_dependency 'rspec', '~> 3.0'
39
40
  spec.add_development_dependency 'rake', '~> 12'
data/lib/care.rb CHANGED
@@ -173,10 +173,10 @@ class Care
173
173
  # @param io[IO] the IO to read from
174
174
  # @param page_i[Integer] which page (zero-based) to read
175
175
  def read_page(io, page_i)
176
- FormatParser::Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1)
176
+ Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1)
177
177
 
178
178
  io.seek(page_i * @page_size)
179
- read_result = io.read(@page_size)
179
+ read_result = Measurometer.instrument('format_parser.Care.read_page') { io.read(@page_size) }
180
180
  if read_result.nil?
181
181
  # If the read went past the end of the IO the read result will be nil,
182
182
  # so we know our IO is exhausted here
data/lib/format_parser.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'set'
2
+ require 'measurometer'
2
3
 
3
4
  # A pretty nimble module for parsing file metadata using partial reads. Contains all the
4
5
  # top-level methods of the library.
@@ -17,6 +18,10 @@ module FormatParser
17
18
  require_relative 'io_constraint'
18
19
  require_relative 'care'
19
20
 
21
+ # Define Measurometer in the internal namespace as well
22
+ # so that we stay compatible for the applications that use it
23
+ const_set(:Measurometer, ::Measurometer)
24
+
20
25
  # Is used to manage access to the shared array of parser constructors, which might
21
26
  # potentially be mutated from different threads. The mutex won't be hit too often
22
27
  # since it only locks when adding/removing parsers.
@@ -95,10 +100,9 @@ module FormatParser
95
100
  # @param formats[Array] an array of file formats to scope the parsing to.
96
101
  # For example `[:jpg, :tif]` will scope the parsing to TIFF and JPEG files.
97
102
  # The default value is "all formats known to FormatParser"
98
- # @param results[:first, :all, Integer] one of the values defining how many results to return if parsing
99
- # is ambiguous. The default is `:first` which returns the first matching result. Other
100
- # possible values are `:all` to get all possible results and an Integer to return
101
- # at most N results.
103
+ # @param results[:first, :all] one of the values defining how many results to return if parsing
104
+ # is ambiguous. The default is `:first` which returns the first matching result. `:all` will return all results.
105
+ # When using `:first` parsing will stop at the first successful match and other parsers won't run.
102
106
  # @param limits_config[ReadLimitsConfig] the configuration object for various read/cache limits. The default
103
107
  # one should be good for most cases.
104
108
  # @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
@@ -251,7 +255,4 @@ module FormatParser
251
255
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
252
256
  require parser_file
253
257
  end
254
- # The Measurometer latches itself onto existing classes, so load it after
255
- # we have loaded all the parsers
256
- require_relative 'measurometer'
257
258
  end
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.13.6'
2
+ VERSION = '0.14.0'
3
3
  end
@@ -74,7 +74,9 @@ module FormatParser::EXIFParser
74
74
  EXIFR.logger = Logger.new(nil)
75
75
 
76
76
  def exif_from_tiff_io(constrained_io)
77
- raw_exif_data = EXIFR::TIFF.new(IOExt.new(constrained_io))
78
- raw_exif_data ? EXIFResult.new(raw_exif_data) : nil
77
+ Measurometer.instrument('format_parser.EXIFParser.exif_from_tiff_io') do
78
+ raw_exif_data = EXIFR::TIFF.new(IOExt.new(constrained_io))
79
+ raw_exif_data ? EXIFResult.new(raw_exif_data) : nil
80
+ end
79
81
  end
80
82
  end
@@ -60,7 +60,7 @@ class FormatParser::JPEGParser
60
60
  end
61
61
  end
62
62
 
63
- FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_read_until_capture', @buf.pos)
63
+ Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_read_until_capture', @buf.pos)
64
64
 
65
65
  # Return at the earliest possible opportunity
66
66
  if @width && @height
@@ -137,7 +137,7 @@ class FormatParser::JPEGParser
137
137
  # ...and only then read the marker contents and parse it as EXIF
138
138
  exif_data = safe_read(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize)
139
139
 
140
- FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', exif_data.bytesize)
140
+ Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', exif_data.bytesize)
141
141
 
142
142
  @exif_data = exif_from_tiff_io(StringIO.new(exif_data))
143
143
  rescue EXIFR::MalformedTIFF
@@ -27,7 +27,9 @@ class FormatParser::MOOVParser
27
27
  # size that gets parsed just before.
28
28
  max_read_offset = 0xFFFFFFFF
29
29
  decoder = Decoder.new
30
- atom_tree = decoder.extract_atom_stream(io, max_read_offset)
30
+ atom_tree = Measurometer.instrument('format_parser.Decoder.extract_atom_stream') do
31
+ decoder.extract_atom_stream(io, max_read_offset)
32
+ end
31
33
 
32
34
  ftyp_atom = decoder.find_first_atom_by_path(atom_tree, 'ftyp')
33
35
  file_type = ftyp_atom.field_value(:major_brand)
@@ -43,10 +43,7 @@ class FormatParser::MP3Parser
43
43
  def to_h
44
44
  tag = __getobj__
45
45
  MEMBERS.each_with_object({}) do |k, h|
46
- # ID3Tag sometimes raises when trying to find an unknown genre.
47
- # If this guard is removed, it fails when trying to do a gsub on a nil,
48
- # in /lib/id3tag/frames/v2/genre_frame/genre_parser_pre_24.rb:25:in `just_genres'
49
- value = tag.public_send(k) rescue nil
46
+ value = tag.public_send(k)
50
47
  h[k] = value if value
51
48
  end
52
49
  end
@@ -74,7 +71,7 @@ class FormatParser::MP3Parser
74
71
  # Compute how many bytes are occupied by the actual MPEG frames
75
72
  ignore_bytes_at_tail = id3v1 ? 128 : 0
76
73
  ignore_bytes_at_head = io.pos
77
- bytes_used_by_frames = io.size - ignore_bytes_at_tail - ignore_bytes_at_tail
74
+ bytes_used_by_frames = io.size - ignore_bytes_at_head - ignore_bytes_at_tail
78
75
 
79
76
  io.seek(ignore_bytes_at_head)
80
77
 
@@ -0,0 +1,218 @@
1
+ # https://xiph.org/vorbis/doc/Vorbis_I_spec.pdf
2
+ # https://en.wikipedia.org/wiki/Ogg#Page_structure
3
+ class FormatParser::OggParser
4
+ include FormatParser::IOUtils
5
+
6
+ # Maximum size of an Ogg page
7
+ MAX_POSSIBLE_PAGE_SIZE = 65307
8
+
9
+ def call(io)
10
+ # The format consists of chunks of data each called an "Ogg page". Each page
11
+ # begins with the characters, "OggS", to identify the file as Ogg format.
12
+ capture_pattern = safe_read(io, 4)
13
+ return unless capture_pattern == 'OggS'
14
+
15
+ io.seek(28) # skip not important bytes
16
+
17
+ # Each header packet begins with the same header fields.
18
+ # 1) packet_type: 8 bit value (the identification header is type 1)
19
+ # 2) the characters 'v','o','r','b','i','s' as six octets
20
+ packet_type, vorbis, _vorbis_version, channels, sample_rate = safe_read(io, 16).unpack('Ca6VCV')
21
+ return unless packet_type == 1 && vorbis == 'vorbis'
22
+
23
+ # In order to calculate the audio duration we have to read a
24
+ # granule_position of the last Ogg page of the file. Unfortunately, we don't
25
+ # know where the last page starts. But we do know that max size of an Ogg
26
+ # page is 65307 bytes. So we read the last 65307 bytes from the file and try
27
+ # to find the last page in this tail.
28
+ pos = io.size - MAX_POSSIBLE_PAGE_SIZE
29
+ pos = 0 if pos < 0
30
+ io.seek(pos)
31
+ tail = io.read(MAX_POSSIBLE_PAGE_SIZE)
32
+ return unless tail
33
+
34
+ granule_position = find_last_granule_position(tail)
35
+ return unless granule_position
36
+
37
+ duration = granule_position / sample_rate.to_f
38
+ return if duration == Float::INFINITY
39
+
40
+ FormatParser::Audio.new(
41
+ format: :ogg,
42
+ audio_sample_rate_hz: sample_rate,
43
+ num_audio_channels: channels,
44
+ media_duration_seconds: duration
45
+ )
46
+ end
47
+
48
+ private
49
+
50
+ def all_indices_of_substr_in_str(of_substring, in_string)
51
+ last_i = 0
52
+ found_at_indices = []
53
+ while last_i = in_string.index(of_substring, last_i)
54
+ found_at_indices << last_i
55
+ last_i += of_substring.bytesize
56
+ end
57
+ found_at_indices
58
+ end
59
+
60
+ # Returns granule_position of the last valid Ogg page contained in the given
61
+ # tail. Since the tail may contain multiple "OggS" entries the method searches
62
+ # them recursively starting from the end. The search stops when the first
63
+ # valid Ogg page is found.
64
+ #
65
+ # The granule position contains the offset of the page in terms of the
66
+ # number of samples from the start of file. So once we know that number
67
+ # we can estimate how long the file is. We _do_ need to add the number
68
+ # of samples the granule covers though
69
+ def find_last_granule_position(in_string)
70
+ # The Ogg page always starts with "OggS". Find all of them
71
+ # in the given tail, since we want to scan "tail to head" -
72
+ # starting with the last index and going down to the first
73
+ rev_indices = all_indices_of_substr_in_str('OggS', in_string).reverse
74
+ rev_indices.each do |idx|
75
+ if granule_pos = extract_granule_position_from_string_at(in_string, idx)
76
+ return granule_pos
77
+ end
78
+ end
79
+ nil # Nothing matched or the list of indices was empty
80
+ end
81
+
82
+ # Since the magic bits may occur inside the body of the page we have to
83
+ # validate that what we found is actually an Ogg page by calculating the
84
+ # checksum. For this reason we have to read the entire page and calculate
85
+ # its checksum. In order to read the entire Ogg page we first have to read a
86
+ # part of its header to find out the size of the page.
87
+ def extract_granule_position_from_string_at(string, at)
88
+ header_size = 27
89
+ header_bytes = string.byteslice(at, header_size)
90
+ return unless header_bytes && header_bytes.bytesize == header_size
91
+
92
+ # Read the Ogg page header excluding the segment table (in other words read
93
+ # first 27 bytes). See https://en.wikipedia.org/wiki/Ogg#Page_structure
94
+ _capture_pattern,
95
+ _version,
96
+ _header_type,
97
+ granule_position,
98
+ _bitstream_serial_number,
99
+ _page_sequence_number,
100
+ checksum,
101
+ num_bytes_page_segments = header_bytes.unpack('a4CCQ<VVVC')
102
+
103
+ # Read the segment table part of the Ogg page header. Its size is stored in page_segments.
104
+ #
105
+ # The segment table is a vector of 8-bit values, each indicating the length
106
+ # of the corresponding segment within the page body.
107
+ # If there are no segments in the segment table the page is certainly invalid
108
+ return if num_bytes_page_segments == 0
109
+
110
+ # Read the segment table
111
+ segment_table_pos = at + header_size
112
+ segment_table = string.byteslice(segment_table_pos, num_bytes_page_segments)
113
+ return unless segment_table && segment_table.bytesize == num_bytes_page_segments
114
+
115
+ # Calculate the size of the Ogg page
116
+ num_bytes_used_for_segments = segment_table.unpack('C*').inject(&:+)
117
+ page_size = header_size + num_bytes_page_segments + num_bytes_used_for_segments
118
+
119
+ # Read the entire page now that we know how much we have to read
120
+ entire_page = string.byteslice(at, page_size)
121
+ return unless entire_page && entire_page.bytesize == page_size
122
+
123
+ # Compute and check the checksum. If this check fails it means one of the two:
124
+ # - the data is corrupted
125
+ # - the "OggS" capture pattern occurs inside the body of the page and
126
+ # we were scanning a random piece of content which was not an Ogg page
127
+ return unless checksum == calculate_checksum(entire_page)
128
+
129
+ # ...and only having gone through all these motions - return the granule position.
130
+ granule_position
131
+ end
132
+
133
+ # Calculate the CRC using the 0x04C11DB7 polynomial. We cannot use Zlib since
134
+ # it generates different checksums. Copied from https://github.com/anibali/ruby-ogg
135
+ def calculate_checksum(data)
136
+ crc_reg = 0
137
+ data.each_byte.with_index do |byte, i|
138
+ # The checksum is calculated over _the entire page_ but with the
139
+ # placeholder for the checksum - the 4 bytes - zeroed out. The checksum
140
+ # is then substituted _into_ the page at that offset. So when we go
141
+ # over bytes at these offsets we will substitute them with 0s
142
+ b = (22..25).cover?(i) ? 0 : byte
143
+ crc_reg = (crc_reg << 8) ^ CRC_LOOKUP[((crc_reg >> 24) & 0xff) ^ b]
144
+ crc_reg = crc_reg % 2**32
145
+ end
146
+
147
+ crc_reg
148
+ end
149
+
150
+ CRC_LOOKUP = [
151
+ 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
152
+ 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
153
+ 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
154
+ 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
155
+ 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
156
+ 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
157
+ 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
158
+ 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
159
+ 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
160
+ 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
161
+ 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
162
+ 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
163
+ 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
164
+ 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
165
+ 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
166
+ 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
167
+ 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
168
+ 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
169
+ 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
170
+ 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
171
+ 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
172
+ 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
173
+ 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
174
+ 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
175
+ 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
176
+ 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
177
+ 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
178
+ 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
179
+ 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
180
+ 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
181
+ 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
182
+ 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
183
+ 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
184
+ 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
185
+ 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
186
+ 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
187
+ 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
188
+ 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
189
+ 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
190
+ 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
191
+ 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
192
+ 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
193
+ 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
194
+ 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
195
+ 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
196
+ 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
197
+ 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
198
+ 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
199
+ 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
200
+ 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
201
+ 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
202
+ 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
203
+ 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
204
+ 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
205
+ 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
206
+ 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
207
+ 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
208
+ 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
209
+ 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
210
+ 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
211
+ 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
212
+ 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
213
+ 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
214
+ 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
215
+ ].freeze
216
+
217
+ FormatParser.register_parser self, natures: :audio, formats: :ogg
218
+ end
@@ -9,67 +9,12 @@ class FormatParser::PDFParser
9
9
  #
10
10
  PDF_MARKER = /%PDF-1\.[0-8]{1}/
11
11
 
12
- # Page counts have different markers depending on
13
- # the PDF type. There is not a single common way of solving
14
- # this. The only way of solving this correctly is by adding
15
- # different types of PDF's in the specs.
16
- #
17
- COUNT_MARKERS = ['Count ']
18
- EOF_MARKER = '%EOF'
19
-
20
12
  def call(io)
21
13
  io = FormatParser::IOConstraint.new(io)
22
14
 
23
15
  return unless safe_read(io, 9) =~ PDF_MARKER
24
16
 
25
- attributes = scan_for_attributes(io)
26
-
27
- FormatParser::Document.new(
28
- format: :pdf,
29
- page_count: attributes[:page_count]
30
- )
31
- end
32
-
33
- private
34
-
35
- # Read ahead bytes until one of % or / is reached.
36
- # A header in a PDF always starts with a /
37
- # The % is to detect the EOF
38
- #
39
- def scan_for_attributes(io)
40
- result = {}
41
-
42
- while read = safe_read(io, 1)
43
- case read
44
- when '%'
45
- break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
46
- when '/'
47
- find_page_count(io, result)
48
- end
49
- end
50
-
51
- result
52
- end
53
-
54
- def find_page_count(io, result)
55
- COUNT_MARKERS.each do |marker|
56
- if safe_read(io, marker.size) == marker
57
- result[:page_count] = read_numbers(io)
58
- end
59
- end
60
- end
61
-
62
- # Read ahead bytes until no more numbers are found
63
- # This assumes that the position of io starts at a
64
- # number
65
- def read_numbers(io)
66
- numbers = ''
67
-
68
- while c = safe_read(io, 1)
69
- c =~ /\d+/ ? numbers << c : break
70
- end
71
-
72
- numbers.to_i
17
+ FormatParser::Document.new(format: :pdf)
73
18
  end
74
19
 
75
20
  FormatParser.register_parser self, natures: :document, formats: :pdf
data/lib/read_limiter.rb CHANGED
@@ -77,9 +77,9 @@ class FormatParser::ReadLimiter
77
77
  # `format_parser.TIFF.read_limiter.num_seeks` and so forth
78
78
  # @return void
79
79
  def send_metrics(prefix)
80
- FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_seeks' % prefix, @seeks)
81
- FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_reads' % prefix, @reads)
82
- FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.read_bytes' % prefix, @bytes)
80
+ Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_seeks' % prefix, @seeks)
81
+ Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_reads' % prefix, @reads)
82
+ Measurometer.add_distribution_value('format_parser.%s.read_limiter.read_bytes' % prefix, @bytes)
83
83
  end
84
84
 
85
85
  # Resets all the recorded call counters so that the object can be reused for the next parser,
data/lib/remote_io.rb CHANGED
@@ -60,7 +60,7 @@ class FormatParser::RemoteIO
60
60
  # @return [String] the read bytes
61
61
  def read(n_bytes)
62
62
  http_range = (@pos..(@pos + n_bytes - 1))
63
- maybe_size, maybe_body = request_range(http_range)
63
+ maybe_size, maybe_body = Measurometer.instrument('format_parser.RemoteIO.read') { request_range(http_range) }
64
64
  if maybe_size && maybe_body
65
65
  @remote_size = maybe_size
66
66
  @pos += maybe_body.bytesize
@@ -103,10 +103,10 @@ class FormatParser::RemoteIO
103
103
  # cannot hint size with this response - at least not when working with S3
104
104
  return
105
105
  when 500..599
106
- FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
106
+ Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
107
107
  raise IntermittentFailure.new(response.status, "Server at #{@uri} replied with a #{response.status} and we might want to retry")
108
108
  else
109
- FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
109
+ Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
110
110
  raise InvalidRequest.new(response.status, "Server at #{@uri} replied with a #{response.status} and refused our request")
111
111
  end
112
112
  end
@@ -5,6 +5,10 @@ describe FormatParser do
5
5
  expect(FormatParser::VERSION).to be_kind_of(String)
6
6
  end
7
7
 
8
+ it 'exposes the Measurometer constant' do
9
+ expect(FormatParser::Measurometer).to be_kind_of(Module)
10
+ end
11
+
8
12
  describe '.parse' do
9
13
  it 'returns nil when trying to parse an empty IO' do
10
14
  d = StringIO.new('')
@@ -57,7 +57,7 @@ describe FormatParser::MP3Parser do
57
57
  expect(parsed.format).to eq(:mp3)
58
58
  expect(parsed.num_audio_channels).to eq(2)
59
59
  expect(parsed.audio_sample_rate_hz).to eq(44100)
60
- expect(parsed.media_duration_seconds).to be_within(0.1).of(1102.46)
60
+ expect(parsed.media_duration_seconds).to be_within(0.1).of(1098.03)
61
61
 
62
62
  expect(parsed.intrinsics).not_to be_nil
63
63
 
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::OggParser do
4
+ it 'parses an ogg file' do
5
+ parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/vorbis.ogg', 'rb'))
6
+
7
+ expect(parse_result.nature).to eq(:audio)
8
+ expect(parse_result.format).to eq(:ogg)
9
+ expect(parse_result.num_audio_channels).to eq(1)
10
+ expect(parse_result.audio_sample_rate_hz).to eq(16000)
11
+ expect(parse_result.media_duration_seconds).to be_within(0.01).of(2973.95)
12
+ end
13
+
14
+ it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do
15
+ parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/with_garbage_at_the_end.ogg', 'rb'))
16
+ expect(parse_result).to be_nil
17
+ end
18
+
19
+ it "correctly parses an ogg file when a magic string occurs in the page's body" do
20
+ parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/with_confusing_magic_string.ogg', 'rb'))
21
+
22
+ expect(parse_result.nature).to eq(:audio)
23
+ expect(parse_result.format).to eq(:ogg)
24
+ expect(parse_result.num_audio_channels).to eq(1)
25
+ expect(parse_result.audio_sample_rate_hz).to eq(8000)
26
+ expect(parse_result.media_duration_seconds).to be_within(0.01).of(0.45)
27
+ end
28
+ end
@@ -18,10 +18,6 @@ describe FormatParser::PDFParser do
18
18
  expect(parsed_pdf.nature).to eq(:document)
19
19
  expect(parsed_pdf.format).to eq(:pdf)
20
20
  end
21
-
22
- it 'has a correct page count' do
23
- expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
24
- end
25
21
  end
26
22
 
27
23
  describe 'a PDF file with a missing version header' do
@@ -44,25 +40,9 @@ describe FormatParser::PDFParser do
44
40
  pending 'does not parse succesfully'
45
41
  end
46
42
 
47
- describe 'a PDF file with a missing COUNT_HEADER' do
48
- let(:pdf_file) { 'missing_page_count.pdf' }
49
-
50
- it 'does not return a page count' do
51
- expect(parsed_pdf.page_count).to eq(nil)
52
- end
53
- end
54
-
55
43
  describe 'parses a PDF file' do
56
44
  describe 'a single page file' do
57
- include_examples :behave_like_pdf, file: '1_page.pdf', page_count: 1
58
- end
59
-
60
- describe 'a multi page pdf file' do
61
- include_examples :behave_like_pdf, file: '2_pages.pdf', page_count: 2
62
- end
63
-
64
- describe 'a multi page pdf file with content' do
65
- include_examples :behave_like_pdf, file: '10_pages.pdf', page_count: 10
45
+ include_examples :behave_like_pdf, file: '1_page.pdf'
66
46
  end
67
47
  end
68
48
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.6
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-05-28 00:00:00.000000000 Z
12
+ date: 2018-06-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -46,6 +46,9 @@ dependencies:
46
46
  - - "~>"
47
47
  - !ruby/object:Gem::Version
48
48
  version: '0.10'
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: 0.10.1
49
52
  type: :runtime
50
53
  prerelease: false
51
54
  version_requirements: !ruby/object:Gem::Requirement
@@ -53,6 +56,9 @@ dependencies:
53
56
  - - "~>"
54
57
  - !ruby/object:Gem::Version
55
58
  version: '0.10'
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 0.10.1
56
62
  - !ruby/object:Gem::Dependency
57
63
  name: faraday
58
64
  requirement: !ruby/object:Gem::Requirement
@@ -67,6 +73,20 @@ dependencies:
67
73
  - - "~>"
68
74
  - !ruby/object:Gem::Version
69
75
  version: '0.13'
76
+ - !ruby/object:Gem::Dependency
77
+ name: measurometer
78
+ requirement: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1'
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1'
70
90
  - !ruby/object:Gem::Dependency
71
91
  name: rspec
72
92
  requirement: !ruby/object:Gem::Requirement
@@ -186,7 +206,6 @@ files:
186
206
  - lib/image.rb
187
207
  - lib/io_constraint.rb
188
208
  - lib/io_utils.rb
189
- - lib/measurometer.rb
190
209
  - lib/parsers/aiff_parser.rb
191
210
  - lib/parsers/bmp_parser.rb
192
211
  - lib/parsers/cr2_parser.rb
@@ -201,6 +220,7 @@ files:
201
220
  - lib/parsers/moov_parser/decoder.rb
202
221
  - lib/parsers/mp3_parser.rb
203
222
  - lib/parsers/mp3_parser/id3_extraction.rb
223
+ - lib/parsers/ogg_parser.rb
204
224
  - lib/parsers/pdf_parser.rb
205
225
  - lib/parsers/png_parser.rb
206
226
  - lib/parsers/psd_parser.rb
@@ -220,7 +240,6 @@ files:
220
240
  - spec/format_parser_inspect_spec.rb
221
241
  - spec/format_parser_spec.rb
222
242
  - spec/io_utils_spec.rb
223
- - spec/measurometer_spec.rb
224
243
  - spec/parsers/aiff_parser_spec.rb
225
244
  - spec/parsers/bmp_parser_spec.rb
226
245
  - spec/parsers/cr2_parser_spec.rb
@@ -232,6 +251,7 @@ files:
232
251
  - spec/parsers/jpeg_parser_spec.rb
233
252
  - spec/parsers/moov_parser_spec.rb
234
253
  - spec/parsers/mp3_parser_spec.rb
254
+ - spec/parsers/ogg_parser_spec.rb
235
255
  - spec/parsers/pdf_parser_spec.rb
236
256
  - spec/parsers/png_parser_spec.rb
237
257
  - spec/parsers/psd_parser_spec.rb
data/lib/measurometer.rb DELETED
@@ -1,100 +0,0 @@
1
- class FormatParser::Measurometer
2
- class << self
3
- # Permits adding instrumentation drivers. Measurometer is 1-1 API
4
- # compatible with Appsignal, which we use a lot. So to magically
5
- # obtain all Appsignal instrumentation, add the Appsignal module
6
- # as a driver.
7
- #
8
- # Measurometer.drivers << Appsignal
9
- #
10
- # A driver must be reentrant and thread-safe - it should be possible
11
- # to have multiple `instrument` calls open from different threads at the
12
- # same time.
13
- # The driver must support the same interface as the Measurometer class
14
- # itself, minus the `drivers` and `instrument_instance_method` methods.
15
- #
16
- # @return Array
17
- def drivers
18
- @drivers ||= []
19
- @drivers
20
- end
21
-
22
- # Runs a given block within a cascade of `instrument` blocks of all the
23
- # added drivers.
24
- #
25
- # Measurometer.instrument('do_foo') { compute! }
26
- #
27
- # unfolds to
28
- # Appsignal.instrument('do_foo') do
29
- # Statsd.timing('do_foo') do
30
- # compute!
31
- # end
32
- # end
33
- #
34
- # A driver must be reentrant and thread-safe - it should be possible
35
- # to have multiple `instrument` calls open from different threads at the
36
- # same time.
37
- # The driver must support the same interface as the Measurometer class
38
- # itself, minus the `drivers` and `instrument_instance_method` methods.
39
- #
40
- # @param block_name[String] under which path to push the metric
41
- # @param blk[#call] the block to instrument
42
- # @return [Object] the return value of &blk
43
- def instrument(block_name, &blk)
44
- return yield unless @drivers && @drivers.any? # The block wrapping business is not free
45
- @drivers.inject(blk) { |outer_block, driver|
46
- -> {
47
- driver.instrument(block_name, &outer_block)
48
- }
49
- }.call
50
- end
51
-
52
- # Adds a distribution value (sample) under a given path
53
- #
54
- # @param value_path[String] under which path to push the metric
55
- # @param value[Numeric] distribution value
56
- # @return nil
57
- def add_distribution_value(value_path, value)
58
- (@drivers || []).each { |d| d.add_distribution_value(value_path, value) }
59
- nil
60
- end
61
-
62
- # Increment a named counter under a given path
63
- #
64
- # @param counter_path[String] under which path to push the metric
65
- # @param by[Integer] the counter increment to apply
66
- # @return nil
67
- def increment_counter(counter_path, by)
68
- (@drivers || []).each { |d| d.increment_counter(counter_path, by) }
69
- nil
70
- end
71
-
72
- # Wrap an anonymous module around an instance method in the given class to have
73
- # it instrumented automatically. The name of the measurement will be interpolated as:
74
- #
75
- # "#{prefix}.#{rightmost_class_constant_name}.#{instance_method_name}"
76
- #
77
- # @param target_class[Class] the class to instrument
78
- # @param instance_method_name_to_instrument[Symbol] the method name to instrument
79
- # @param path_prefix[String] under which path to push the instrumented metric
80
- # @return void
81
- def instrument_instance_method(target_class, instance_method_name_to_instrument, path_prefix)
82
- short_class_name = target_class.to_s.split('::').last
83
- instrumentation_name = [path_prefix, short_class_name, instance_method_name_to_instrument].join('.')
84
- instrumenter_module = Module.new do
85
- define_method(instance_method_name_to_instrument) do |*any|
86
- ::FormatParser::Measurometer.instrument(instrumentation_name) { super(*any) }
87
- end
88
- end
89
- target_class.prepend(instrumenter_module)
90
- end
91
- end
92
-
93
- # Instrument things interesting in the global sense
94
- instrument_instance_method(FormatParser::RemoteIO, :read, 'format_parser')
95
- instrument_instance_method(Care::Cache, :read_page, 'format_parser')
96
-
97
- # Instrument more specific things on a per-parser basis
98
- instrument_instance_method(FormatParser::EXIFParser, :scan_image_tiff, 'format_parser')
99
- instrument_instance_method(FormatParser::MOOVParser::Decoder, :extract_atom_stream, 'format_parser.parsers.MOOVParser')
100
- end
@@ -1,48 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe FormatParser::Measurometer do
4
- RSpec::Matchers.define :include_counter_or_measurement_named do |named|
5
- match do |actual|
6
- actual.any? do |e|
7
- e[0] == named && e[1] > 0
8
- end
9
- end
10
- end
11
-
12
- it 'instruments a full cycle FormatParser.parse' do
13
- driver_class = Class.new do
14
- attr_accessor :timings, :counters, :distributions
15
- def instrument(block_name)
16
- s = Process.clock_gettime(Process::CLOCK_MONOTONIC)
17
- yield.tap do
18
- delta = Process.clock_gettime(Process::CLOCK_MONOTONIC) - s
19
- @timings ||= []
20
- @timings << [block_name, delta * 1000]
21
- end
22
- end
23
-
24
- def add_distribution_value(value_path, value)
25
- @distributions ||= []
26
- @distributions << [value_path, value]
27
- end
28
-
29
- def increment_counter(value_path, value)
30
- @counters ||= []
31
- @counters << [value_path, value]
32
- end
33
- end
34
-
35
- instrumenter = driver_class.new
36
- described_class.drivers << instrumenter
37
-
38
- FormatParser.parse(File.open(fixtures_dir + 'JPEG/keynote_recognized_as_jpeg.key', 'rb'), results: :all)
39
-
40
- described_class.drivers.delete(instrumenter)
41
- expect(described_class.drivers).not_to include(instrumenter)
42
-
43
- expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.detected_formats.zip')
44
- expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.parser.Care.page_reads_from_upsteam')
45
- expect(instrumenter.distributions).to include_counter_or_measurement_named('format_parser.ZIPParser.read_limiter.read_bytes')
46
- expect(instrumenter.timings).to include_counter_or_measurement_named('format_parser.Cache.read_page')
47
- end
48
- end