format_parser 0.25.2 → 0.25.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/CONTRIBUTING.md +59 -22
- data/lib/format_parser.rb +12 -12
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/mp3_parser.rb +2 -0
- data/spec/parsers/mp3_parser_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2d0b2c07289221019c42f9546eee65b4ccd5c49aadc3c16f7e4192f356821bcb
|
4
|
+
data.tar.gz: 7163ca3bfac79fe5539979e5723902db8fe530ee721d09ba3b18cf635280ece6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfb72a909878a032d6f832aa0a52bd0521df31faacff2f5d070e9b4f555d420ad11abecfc30e2074f7897250b530543a4892b60d37037f67b87dd8c9b10c2b8b
|
7
|
+
data.tar.gz: 24b69b8ad67b5d4461f63055b379af270a936172660d6350cc98ee2caa2c47ff3b7d94bf353de673ba331ba273b1e86090f83ef282933f261978250562aa22cd
|
data/CHANGELOG.md
CHANGED
data/CONTRIBUTING.md
CHANGED
@@ -83,32 +83,59 @@ of software. Ideally, this file is going to be something you have produced yours
|
|
83
83
|
and you are permitted to share under the MIT license provisions.
|
84
84
|
|
85
85
|
When writing a parser, please try to ensure it returns a usable result as soon as possible,
|
86
|
-
or
|
86
|
+
or `nil` as soon as possible (once you know the file is not fit for your specific parser).
|
87
87
|
Bear in mind that we enforce read budgets per-parser, so you will not be allowed to perform
|
88
88
|
too many reads, or perform reads which are too large.
|
89
89
|
|
90
|
-
In order to create new parsers,
|
90
|
+
In order to create new parsers, make a well-named class with an instance method `call`,
|
91
|
+
and register a single instance of that class as the parser - so that only one object needs to be stored
|
92
|
+
in memory when parsing multiple inputs. In that case your object must be **thread-safe and stateless** - this
|
93
|
+
is really important since FormatParser is thread-safe and multiple parsing procedures may be in progress
|
94
|
+
concurrently against the same parser object. You can also create a Proc if your parser is fairly trivial.
|
91
95
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
+
If it will be difficult to have your parser thread-safe you can register your class itself as
|
97
|
+
the parser and define the `self.call` method to parse using a fresh instance every time, allowing
|
98
|
+
object-level state:
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
class MyParser
|
102
|
+
def self.call(io)
|
103
|
+
new.call(io)
|
104
|
+
end
|
105
|
+
|
106
|
+
def call(io)
|
107
|
+
@state = ...
|
108
|
+
end
|
109
|
+
```
|
110
|
+
|
111
|
+
`call` accepts a single argument - an IO-ish object which is guaranteed to respond to the same methods as the
|
112
|
+
ones defined in `IOConstraint` - that is, it is a strict subset of a standard Ruby IO object. *All reads from
|
113
|
+
this IO object are guaranteed to be returned in binary encoding.* The IO will be at offset 0 when your parsing
|
114
|
+
proc receives it and there will be no concurrent calls to that object until your proc returns.
|
96
115
|
|
97
|
-
|
116
|
+
Your parsing procedure may read from this IO object, and should return either a `Result`-like object with
|
117
|
+
the file metadata (if it could recover any) or `nil` if it couldn't. All files pass
|
118
|
+
through all parsers by default, so if you are dealing with a file that is not "your" format - return `nil` from
|
119
|
+
your method or `break` your Proc as early as possible. A blank `return` works fine too as it actually returns `nil`.
|
98
120
|
|
99
|
-
Your parser
|
100
|
-
and file natures it provides.
|
121
|
+
Your parser then needs to be registered using `FormatParser.register_parser` with the information on the formats
|
122
|
+
and file natures it provides. This allows FormatParser to skip your parser if, say, the user only wants to parse for
|
123
|
+
`:image` nature files but your parser parses `:audio`.
|
101
124
|
|
102
|
-
Down below you can find the most basic parser implementation:
|
125
|
+
Down below you can find the most basic parser implementation which parses an imaginary `IMGA` file format:
|
103
126
|
|
104
127
|
```ruby
|
105
128
|
MyParser = ->(io) {
|
106
|
-
# ...
|
129
|
+
# ... Read the magic bytes from the start of IO - the IO is
|
130
|
+
# guaranteed to be fed to you at offset 0, start-of-file.
|
107
131
|
magic_bytes = io.read(4)
|
132
|
+
|
108
133
|
# breaking the block returns `nil` to the caller signaling "no match"
|
109
134
|
break if magic_bytes != 'IMGA'
|
110
135
|
|
136
|
+
# Our file format stores the width and height as 2 32-bit unsigned integers
|
111
137
|
parsed_witdh, parsed_height = io.read(8).unpack('VV')
|
138
|
+
|
112
139
|
# ...and return the FileInformation::Image object with the metadata.
|
113
140
|
FormatParser::Image.new(
|
114
141
|
format: :imga,
|
@@ -135,8 +162,8 @@ class MyParser
|
|
135
162
|
# ... do some parsing with `io`
|
136
163
|
# The instance will be discarded after parsing, so using instance variables
|
137
164
|
# is permitted - they are not shared between calls to `call`
|
138
|
-
|
139
|
-
break if
|
165
|
+
magic_bytes = io.read(4)
|
166
|
+
break if magic_bytes != 'IMGA'
|
140
167
|
parsed_witdh, parsed_height = io.read(8).unpack('VV')
|
141
168
|
FormatParser::Image.new(
|
142
169
|
format: :imga,
|
@@ -145,23 +172,33 @@ class MyParser
|
|
145
172
|
)
|
146
173
|
end
|
147
174
|
|
148
|
-
|
175
|
+
# Note that we register an instance of the class, not the class. It is the
|
176
|
+
# instance that responds to `call()` and we can do this because our object
|
177
|
+
# is stateless.
|
178
|
+
FormatParser.register_parser new, natures: :image, formats: :bmp
|
149
179
|
end
|
150
180
|
```
|
151
181
|
|
152
|
-
|
182
|
+
If your parser supports file types which have a known filename extension, you can add a method to it called `likely_match?`,
|
183
|
+
defined on the very object you register. For example, for the ZIP parser we use:
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
def likely_match?(filename)
|
187
|
+
filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
|
188
|
+
end
|
189
|
+
```
|
153
190
|
|
154
|
-
|
191
|
+
If your parser matches the filename it is going to be applied *earlier*, saving time. Since most FormatParser users are
|
192
|
+
likely to only want the first result of the parsing, the sooner your parser gets applied - the sooner you can return the result,
|
193
|
+
avoiding unnecessary reads.
|
155
194
|
|
156
|
-
|
157
|
-
2) An object that responds to `new` and returns something that can be `call()`-ed with with an argument that conforms to `IOConstraint`.
|
195
|
+
### Calling convention for preparing parsers
|
158
196
|
|
159
|
-
|
197
|
+
A parser that gets registered using `register_parser` must be an object that can be `call()`-ed, with an argument that conforms to `IOConstraint`
|
160
198
|
|
161
199
|
FormatParser is made to be used in threaded environments, and if you use instance variables
|
162
|
-
you need your parser to be isolated from it's siblings in other threads -
|
163
|
-
|
164
|
-
|
200
|
+
you need your parser to be isolated from its siblings in other threads - create a copy for one-off use inside
|
201
|
+
your `call` method.
|
165
202
|
|
166
203
|
## Pull requests
|
167
204
|
|
data/lib/format_parser.rb
CHANGED
@@ -36,7 +36,7 @@ module FormatParser
|
|
36
36
|
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
37
37
|
# provides out of the box registers itself using this method.
|
38
38
|
#
|
39
|
-
# @param
|
39
|
+
# @param callable_parser[#call] an object that responds to #call for parsing an IO
|
40
40
|
# @param formats[Array<Symbol>] file formats that the parser provides
|
41
41
|
# @param natures[Array<Symbol>] file natures that the parser provides
|
42
42
|
# @param priority[Integer] whether the parser has to be applied first or later. Parsers that offer the safest
|
@@ -45,39 +45,39 @@ module FormatParser
|
|
45
45
|
# with a lower priority value will be applied first, and if a single result is requested, will also return
|
46
46
|
# first.
|
47
47
|
# @return void
|
48
|
-
def self.register_parser(
|
48
|
+
def self.register_parser(callable_parser, formats:, natures:, priority: LEAST_PRIORITY)
|
49
49
|
parser_provided_formats = Array(formats)
|
50
50
|
parser_provided_natures = Array(natures)
|
51
51
|
PARSER_MUX.synchronize do
|
52
52
|
@parsers ||= Set.new
|
53
|
-
@parsers <<
|
53
|
+
@parsers << callable_parser
|
54
54
|
@parsers_per_nature ||= {}
|
55
55
|
parser_provided_natures.each do |provided_nature|
|
56
56
|
@parsers_per_nature[provided_nature] ||= Set.new
|
57
|
-
@parsers_per_nature[provided_nature] <<
|
57
|
+
@parsers_per_nature[provided_nature] << callable_parser
|
58
58
|
end
|
59
59
|
@parsers_per_format ||= {}
|
60
60
|
parser_provided_formats.each do |provided_format|
|
61
61
|
@parsers_per_format[provided_format] ||= Set.new
|
62
|
-
@parsers_per_format[provided_format] <<
|
62
|
+
@parsers_per_format[provided_format] << callable_parser
|
63
63
|
end
|
64
64
|
@parser_priorities ||= {}
|
65
|
-
@parser_priorities[
|
65
|
+
@parser_priorities[callable_parser] = priority
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
69
|
# Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
|
70
70
|
# tests, but can also be used to forcibly disable some formats completely.
|
71
71
|
#
|
72
|
-
# @param
|
72
|
+
# @param callable_parser[#==] an object that is identity-equal to any other registered parser
|
73
73
|
# @return void
|
74
|
-
def self.deregister_parser(
|
74
|
+
def self.deregister_parser(callable_parser)
|
75
75
|
# Used only in tests
|
76
76
|
PARSER_MUX.synchronize do
|
77
|
-
(@parsers || []).delete(
|
78
|
-
(@parsers_per_nature || {}).values.map { |e| e.delete(
|
79
|
-
(@parsers_per_format || {}).values.map { |e| e.delete(
|
80
|
-
(@parser_priorities || {}).delete(
|
77
|
+
(@parsers || []).delete(callable_parser)
|
78
|
+
(@parsers_per_nature || {}).values.map { |e| e.delete(callable_parser) }
|
79
|
+
(@parsers_per_format || {}).values.map { |e| e.delete(callable_parser) }
|
80
|
+
(@parser_priorities || {}).delete(callable_parser)
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -73,6 +73,8 @@ class FormatParser::MP3Parser
|
|
73
73
|
id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
|
74
74
|
tags = [id3v1, ID3Extraction.attempt_id3_v2_extraction(io)].compact
|
75
75
|
|
76
|
+
io.seek(0) if tags.empty?
|
77
|
+
|
76
78
|
# Compute how many bytes are occupied by the actual MPEG frames
|
77
79
|
ignore_bytes_at_tail = id3v1 ? 128 : 0
|
78
80
|
ignore_bytes_at_head = io.pos
|
@@ -166,6 +166,18 @@ describe FormatParser::MP3Parser do
|
|
166
166
|
expect(parsed.artist). to eq('wetransfer')
|
167
167
|
end
|
168
168
|
|
169
|
+
it 'does not skip the first bytes if it is not a id3 tag header' do
|
170
|
+
fpath = fixtures_dir + '/MP3/no_id3_tags.mp3'
|
171
|
+
|
172
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
173
|
+
|
174
|
+
expect(parsed).not_to be_nil
|
175
|
+
|
176
|
+
expect(parsed.nature).to eq(:audio)
|
177
|
+
expect(parsed.format).to eq(:mp3)
|
178
|
+
expect(parsed.audio_sample_rate_hz).to eq(44100)
|
179
|
+
end
|
180
|
+
|
169
181
|
describe '#as_json' do
|
170
182
|
it 'converts all hash keys to string when stringify_keys: true' do
|
171
183
|
fpath = fixtures_dir + '/MP3/Cassy.mp3'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.25.
|
4
|
+
version: 0.25.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-10-
|
12
|
+
date: 2020-10-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|