format_parser 0.25.2 → 0.25.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/CONTRIBUTING.md +59 -22
- data/lib/format_parser.rb +12 -12
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/mp3_parser.rb +2 -0
- data/spec/parsers/mp3_parser_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2d0b2c07289221019c42f9546eee65b4ccd5c49aadc3c16f7e4192f356821bcb
|
4
|
+
data.tar.gz: 7163ca3bfac79fe5539979e5723902db8fe530ee721d09ba3b18cf635280ece6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfb72a909878a032d6f832aa0a52bd0521df31faacff2f5d070e9b4f555d420ad11abecfc30e2074f7897250b530543a4892b60d37037f67b87dd8c9b10c2b8b
|
7
|
+
data.tar.gz: 24b69b8ad67b5d4461f63055b379af270a936172660d6350cc98ee2caa2c47ff3b7d94bf353de673ba331ba273b1e86090f83ef282933f261978250562aa22cd
|
data/CHANGELOG.md
CHANGED
data/CONTRIBUTING.md
CHANGED
@@ -83,32 +83,59 @@ of software. Ideally, this file is going to be something you have produced yours
|
|
83
83
|
and you are permitted to share under the MIT license provisions.
|
84
84
|
|
85
85
|
When writing a parser, please try to ensure it returns a usable result as soon as possible,
|
86
|
-
or
|
86
|
+
or `nil` as soon as possible (once you know the file is not fit for your specific parser).
|
87
87
|
Bear in mind that we enforce read budgets per-parser, so you will not be allowed to perform
|
88
88
|
too many reads, or perform reads which are too large.
|
89
89
|
|
90
|
-
In order to create new parsers,
|
90
|
+
In order to create new parsers, make a well-named class with an instance method `call`,
|
91
|
+
and register a single instance of that class as the parser - so that only one object needs to be stored
|
92
|
+
in memory when parsing multiple inputs. In that case your object must be **thread-safe and stateless** - this
|
93
|
+
is really important since FormatParser is thread-safe and multiple parsing procedures may be in progress
|
94
|
+
concurrently against the same parser object. You can also create a Proc if your parser is fairly trivial.
|
91
95
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
+
If it will be difficult to have your parser thread-safe you can register your class itself as
|
97
|
+
the parser and define the `self.call` method to parse using a fresh instance every time, allowing
|
98
|
+
object-level state:
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
class MyParser
|
102
|
+
def self.call(io)
|
103
|
+
new.call(io)
|
104
|
+
end
|
105
|
+
|
106
|
+
def call(io)
|
107
|
+
@state = ...
|
108
|
+
end
|
109
|
+
```
|
110
|
+
|
111
|
+
`call` accepts a single argument - an IO-ish object which is guaranteed to respond to the same methods as the
|
112
|
+
ones defined in `IOConstraint` - that is, it is a strict subset of a standard Ruby IO object. *All reads from
|
113
|
+
this IO object are guaranteed to be returned in binary encoding.* The IO will be at offset of 0 when your parsing
|
114
|
+
proc receives it and there will be no concurrent calls to that object until your proc returns.
|
96
115
|
|
97
|
-
|
116
|
+
Your parsing procedure may read from this IO object, and should return either a `Result`-like object with
|
117
|
+
the file metadata (if it could recover any) or `nil` if it couldn't. All files pass
|
118
|
+
through all parsers by default, so if you are dealing with a file that is not "your" format - return `nil` from
|
119
|
+
your method or `break` your Proc as early as possible. A blank `return` works fine too as it actually returns `nil`.
|
98
120
|
|
99
|
-
Your parser
|
100
|
-
and file natures it provides.
|
121
|
+
Your parser then needs to be registered using `FormatParser.register_parser` with the information on the formats
|
122
|
+
and file natures it provides. This allows FormatParser to skip your parser if, say, the user only wants to parse for
|
123
|
+
`:image` nature files but your parser parses `:audio`.
|
101
124
|
|
102
|
-
Down below you can find the most basic parser implementation:
|
125
|
+
Down below you can find the most basic parser implementation which parses an imaginary `IMGA` file format:
|
103
126
|
|
104
127
|
```ruby
|
105
128
|
MyParser = ->(io) {
|
106
|
-
# ...
|
129
|
+
# ... Read the magic bytes from the start of IO - the IO is
|
130
|
+
# guaranteed to be fed to you at offset 0, start-of-file.
|
107
131
|
magic_bytes = io.read(4)
|
132
|
+
|
108
133
|
# breaking the block returns `nil` to the caller signaling "no match"
|
109
134
|
break if magic_bytes != 'IMGA'
|
110
135
|
|
136
|
+
# Our file format stores the width and height as 2 32-bit unsigned integers
|
111
137
|
parsed_witdh, parsed_height = io.read(8).unpack('VV')
|
138
|
+
|
112
139
|
# ...and return the FileInformation::Image object with the metadata.
|
113
140
|
FormatParser::Image.new(
|
114
141
|
format: :imga,
|
@@ -135,8 +162,8 @@ class MyParser
|
|
135
162
|
# ... do some parsing with `io`
|
136
163
|
# The instance will be discarded after parsing, so using instance variables
|
137
164
|
# is permitted - they are not shared between calls to `call`
|
138
|
-
|
139
|
-
break if
|
165
|
+
magic_bytes = io.read(4)
|
166
|
+
break if magic_bytes != 'IMGA'
|
140
167
|
parsed_witdh, parsed_height = io.read(8).unpack('VV')
|
141
168
|
FormatParser::Image.new(
|
142
169
|
format: :imga,
|
@@ -145,23 +172,33 @@ class MyParser
|
|
145
172
|
)
|
146
173
|
end
|
147
174
|
|
148
|
-
|
175
|
+
# Note that we register an instance of the class, not the class. It is the
|
176
|
+
# instance that responds to `call()` and we can do this because our object
|
177
|
+
# is stateless.
|
178
|
+
FormatParser.register_parser new, natures: :image, formats: :bmp
|
149
179
|
end
|
150
180
|
```
|
151
181
|
|
152
|
-
|
182
|
+
If your parser supports file types which have a known filename extension, you can add a method to it called `likely_match?`,
|
183
|
+
defined on the very object that you register. For example, for the ZIP parser we use:
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
def likely_match?(filename)
|
187
|
+
filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
|
188
|
+
end
|
189
|
+
```
|
153
190
|
|
154
|
-
|
191
|
+
If your parser matches the filename it is going to be applied *earlier*, saving time. Since most FormatParser users are
|
192
|
+
likely to only want the first result of the parsing, the sooner your parser gets applied - the sooner you can return the result,
|
193
|
+
avoiding unnecessary reads.
|
155
194
|
|
156
|
-
|
157
|
-
2) An object that responds to `new` and returns something that can be `call()`-ed with with an argument that conforms to `IOConstraint`.
|
195
|
+
### Calling convention for preparing parsers
|
158
196
|
|
159
|
-
|
197
|
+
A parser that gets registered using `register_parser` must be an object that can be `call()`-ed, with an argument that conforms to `IOConstraint`
|
160
198
|
|
161
199
|
FormatParser is made to be used in threaded environments, and if you use instance variables
|
162
|
-
you need your parser to be isolated from it's siblings in other threads -
|
163
|
-
|
164
|
-
|
200
|
+
you need your parser to be isolated from its siblings in other threads - create a copy for one-off use inside
|
201
|
+
your `call` method.
|
165
202
|
|
166
203
|
## Pull requests
|
167
204
|
|
data/lib/format_parser.rb
CHANGED
@@ -36,7 +36,7 @@ module FormatParser
|
|
36
36
|
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
37
37
|
# provides out of the box registers itself using this method.
|
38
38
|
#
|
39
|
-
# @param
|
39
|
+
# @param callable_parser[#call] an object that responds to #call for parsing an IO
|
40
40
|
# @param formats[Array<Symbol>] file formats that the parser provides
|
41
41
|
# @param natures[Array<Symbol>] file natures that the parser provides
|
42
42
|
# @param priority[Integer] whether the parser has to be applied first or later. Parsers that offer the safest
|
@@ -45,39 +45,39 @@ module FormatParser
|
|
45
45
|
# with a lower priority value will be applied first, and if a single result is requested, will also return
|
46
46
|
# first.
|
47
47
|
# @return void
|
48
|
-
def self.register_parser(
|
48
|
+
def self.register_parser(callable_parser, formats:, natures:, priority: LEAST_PRIORITY)
|
49
49
|
parser_provided_formats = Array(formats)
|
50
50
|
parser_provided_natures = Array(natures)
|
51
51
|
PARSER_MUX.synchronize do
|
52
52
|
@parsers ||= Set.new
|
53
|
-
@parsers <<
|
53
|
+
@parsers << callable_parser
|
54
54
|
@parsers_per_nature ||= {}
|
55
55
|
parser_provided_natures.each do |provided_nature|
|
56
56
|
@parsers_per_nature[provided_nature] ||= Set.new
|
57
|
-
@parsers_per_nature[provided_nature] <<
|
57
|
+
@parsers_per_nature[provided_nature] << callable_parser
|
58
58
|
end
|
59
59
|
@parsers_per_format ||= {}
|
60
60
|
parser_provided_formats.each do |provided_format|
|
61
61
|
@parsers_per_format[provided_format] ||= Set.new
|
62
|
-
@parsers_per_format[provided_format] <<
|
62
|
+
@parsers_per_format[provided_format] << callable_parser
|
63
63
|
end
|
64
64
|
@parser_priorities ||= {}
|
65
|
-
@parser_priorities[
|
65
|
+
@parser_priorities[callable_parser] = priority
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
69
|
# Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
|
70
70
|
# tests, but can also be used to forcibly disable some formats completely.
|
71
71
|
#
|
72
|
-
# @param
|
72
|
+
# @param callable_parser[#==] an object that is identity-equal to any other registered parser
|
73
73
|
# @return void
|
74
|
-
def self.deregister_parser(
|
74
|
+
def self.deregister_parser(callable_parser)
|
75
75
|
# Used only in tests
|
76
76
|
PARSER_MUX.synchronize do
|
77
|
-
(@parsers || []).delete(
|
78
|
-
(@parsers_per_nature || {}).values.map { |e| e.delete(
|
79
|
-
(@parsers_per_format || {}).values.map { |e| e.delete(
|
80
|
-
(@parser_priorities || {}).delete(
|
77
|
+
(@parsers || []).delete(callable_parser)
|
78
|
+
(@parsers_per_nature || {}).values.map { |e| e.delete(callable_parser) }
|
79
|
+
(@parsers_per_format || {}).values.map { |e| e.delete(callable_parser) }
|
80
|
+
(@parser_priorities || {}).delete(callable_parser)
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -73,6 +73,8 @@ class FormatParser::MP3Parser
|
|
73
73
|
id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
|
74
74
|
tags = [id3v1, ID3Extraction.attempt_id3_v2_extraction(io)].compact
|
75
75
|
|
76
|
+
io.seek(0) if tags.empty?
|
77
|
+
|
76
78
|
# Compute how many bytes are occupied by the actual MPEG frames
|
77
79
|
ignore_bytes_at_tail = id3v1 ? 128 : 0
|
78
80
|
ignore_bytes_at_head = io.pos
|
@@ -166,6 +166,18 @@ describe FormatParser::MP3Parser do
|
|
166
166
|
expect(parsed.artist). to eq('wetransfer')
|
167
167
|
end
|
168
168
|
|
169
|
+
it 'does not skip the first bytes if it is not a id3 tag header' do
|
170
|
+
fpath = fixtures_dir + '/MP3/no_id3_tags.mp3'
|
171
|
+
|
172
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
173
|
+
|
174
|
+
expect(parsed).not_to be_nil
|
175
|
+
|
176
|
+
expect(parsed.nature).to eq(:audio)
|
177
|
+
expect(parsed.format).to eq(:mp3)
|
178
|
+
expect(parsed.audio_sample_rate_hz).to eq(44100)
|
179
|
+
end
|
180
|
+
|
169
181
|
describe '#as_json' do
|
170
182
|
it 'converts all hash keys to string when stringify_keys: true' do
|
171
183
|
fpath = fixtures_dir + '/MP3/Cassy.mp3'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.25.
|
4
|
+
version: 0.25.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-10-
|
12
|
+
date: 2020-10-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|