format_parser 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -8
- data/format_parser.gemspec +9 -3
- data/lib/audio.rb +37 -0
- data/lib/document.rb +17 -0
- data/lib/format_parser.rb +41 -10
- data/lib/format_parser/version.rb +1 -1
- data/lib/{file_information.rb → image.rb} +5 -26
- data/lib/parsers/aiff_parser.rb +7 -4
- data/lib/parsers/dpx_parser.rb +8 -3
- data/lib/parsers/dsl.rb +29 -0
- data/lib/parsers/fdx_parser.rb +10 -7
- data/lib/parsers/gif_parser.rb +8 -5
- data/lib/parsers/jpeg_parser.rb +8 -12
- data/lib/parsers/moov_parser.rb +9 -7
- data/lib/parsers/mp3_parser.rb +7 -4
- data/lib/parsers/png_parser.rb +9 -5
- data/lib/parsers/psd_parser.rb +8 -5
- data/lib/parsers/tiff_parser.rb +9 -13
- data/lib/parsers/wav_parser.rb +6 -5
- data/lib/video.rb +33 -0
- data/spec/aiff_parser_spec.rb +6 -6
- data/spec/file_information_spec.rb +4 -4
- data/spec/format_parser_spec.rb +30 -2
- data/spec/parsers/dpx_parser_spec.rb +4 -4
- data/spec/parsers/fdx_parser_spec.rb +5 -5
- data/spec/parsers/gif_parser_spec.rb +4 -4
- data/spec/parsers/jpeg_parser_spec.rb +4 -4
- data/spec/parsers/moov_parser_spec.rb +9 -8
- data/spec/parsers/mp3_parser_spec.rb +9 -9
- data/spec/parsers/png_parser_spec.rb +4 -4
- data/spec/parsers/psd_parser_spec.rb +3 -3
- data/spec/parsers/tiff_parser_spec.rb +4 -4
- data/spec/parsers/wav_parser_spec.rb +13 -13
- data/spec/remote_fetching_spec.rb +9 -9
- metadata +14 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aac884390c22699e4c3508ef5ecd363d2f1d2b5a
|
4
|
+
data.tar.gz: da10c68c25b0ba46929282a1bcd918e9bf4513b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74efa5fe85532815e9475e711af8cfc6a2a4478c1832479ec073f7fdecdf4ed2997a61d2c5e068fe6013cd8fb9fbea7c16eaf1bc109485dde7d469b4cec62342
|
7
|
+
data.tar.gz: 2a8da42f8d4f49dff0b7c76393d5c9385ef109ada91d539235d5afbef98689e5f99ca4a5b9a99843c109d2665ea1c4843192d3cf9c25d4ccb3117c98c25fb455
|
data/README.md
CHANGED
@@ -15,18 +15,57 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
|
|
15
15
|
|
16
16
|
## Basic usage
|
17
17
|
|
18
|
-
Pass an IO object that responds to `read` and `seek` to `FormatParser
|
18
|
+
Pass an IO object that responds to `read` and `seek` to `FormatParser` and an array of matches will be returned.
|
19
19
|
|
20
20
|
```ruby
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
matches = FormatParser.parse(File.open("myimage.jpg", "rb"))
|
22
|
+
matches.first.nature #=> :image
|
23
|
+
matches.first.format #=> :jpg
|
24
|
+
matches.first.width_px #=> 320
|
25
|
+
matches.first.height_px #=> 240
|
26
|
+
matches.first.orientation #=> :top_left
|
27
27
|
```
|
28
28
|
|
29
|
-
If
|
29
|
+
If you would rather receive only one result, call the gem as follows:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
FormatParser.parse(File.open("myimage.jpg", "rb"), returns: :one)
|
33
|
+
```
|
34
|
+
|
35
|
+
You can also optimize the metadata extraction by providing hints to the gem:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
FormatParser.parse(File.open("myimage", "rb"), natures: [:video, :image], formats: [:jpg, :png, :mp4])
|
39
|
+
```
|
40
|
+
|
41
|
+
## Creating your own parsers
|
42
|
+
|
43
|
+
In order to create new parsers, these have to meet three requirements:
|
44
|
+
|
45
|
+
1) Instances of the new parser class need to respond to a `call` method, which takes one IO object as an argument and returns some metadata information about its corresponding file, or nil otherwise.
|
46
|
+
2) Instances of the new parser class need to respond to the `natures` and `formats` accessor methods, both returning an array of symbols. A simple DSL is provided to avoid writing those accessors.
|
47
|
+
3) The class needs to register itself as a parser.
|
48
|
+
|
49
|
+
|
50
|
+
Down below you can find a basic parser implementation:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
class BasicParser
|
54
|
+
include FormatParser::DSL # Adds formats and natures methods to the class, which define
|
55
|
+
# accessor for all the instances.
|
56
|
+
|
57
|
+
formats :foo, :baz # Indicates which formats it can read.
|
58
|
+
natures :bar # Indicates which type of file from a human perspective it can read:
|
59
|
+
# - :audio
|
60
|
+
# - :document
|
61
|
+
# - :image
|
62
|
+
# - :video
|
63
|
+
def call(file)
|
64
|
+
# Returns a DTO object including some metadata.
|
65
|
+
end
|
66
|
+
|
67
|
+
FormatParser.register_parser_constructor self # Register this parser.
|
68
|
+
```
|
30
69
|
|
31
70
|
## Design rationale
|
32
71
|
|
data/format_parser.gemspec
CHANGED
@@ -15,8 +15,14 @@ Gem::Specification.new do |spec|
|
|
15
15
|
minimum amount of data possible."
|
16
16
|
spec.homepage = "https://github.com/WeTransfer/format_parser"
|
17
17
|
spec.license = "MIT"
|
18
|
-
|
19
|
-
|
18
|
+
# Alert people to a change in the gem's interface, will remove in a subsequent version
|
19
|
+
spec.post_install_message = %q{
|
20
|
+
-----------------------------------------------------------------------------
|
21
|
+
| ATTENTION: format_parser v0.2.0 introduces changes to the gem's interface.|
|
22
|
+
| See https://github.com/WeTransfer/format_parser#basic-usage |
|
23
|
+
| for up-to-date usage instructions. Thank you for using format_parser! :) |
|
24
|
+
-----------------------------------------------------------------------------
|
25
|
+
}
|
20
26
|
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
21
27
|
if spec.respond_to?(:metadata)
|
22
28
|
spec.metadata['allowed_push_host'] = "https://rubygems.org"
|
@@ -35,7 +41,7 @@ Gem::Specification.new do |spec|
|
|
35
41
|
spec.add_dependency 'ks', '~> 0.0.1'
|
36
42
|
spec.add_dependency 'exifr', '~> 1.0'
|
37
43
|
spec.add_dependency 'faraday', '~> 0.13'
|
38
|
-
|
44
|
+
|
39
45
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
40
46
|
spec.add_development_dependency 'rake', '~> 12'
|
41
47
|
spec.add_development_dependency 'simplecov', '~> 0.15'
|
data/lib/audio.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module FormatParser
|
2
|
+
class Audio
|
3
|
+
NATURE = :audio
|
4
|
+
|
5
|
+
# Type of the file (e.g :mp3)
|
6
|
+
attr_accessor :format
|
7
|
+
|
8
|
+
# The number of audio channels for sound files that are muxed
|
9
|
+
# and for video files with embedded sound
|
10
|
+
attr_accessor :num_audio_channels
|
11
|
+
|
12
|
+
# The audio sample rate (in Hz) for sound files that are muxed
|
13
|
+
# and for video files with embedded sound
|
14
|
+
attr_accessor :audio_sample_rate_hz
|
15
|
+
|
16
|
+
# Duration of the media object (be it audio or video) in seconds,
|
17
|
+
# as a Float
|
18
|
+
attr_accessor :media_duration_seconds
|
19
|
+
|
20
|
+
# Duration of the media object in addressable frames or samples,
|
21
|
+
# as an Integer
|
22
|
+
attr_accessor :media_duration_frames
|
23
|
+
|
24
|
+
# If a parser wants to provide any extra information to the caller
|
25
|
+
# it can be placed here
|
26
|
+
attr_accessor :intrinsics
|
27
|
+
|
28
|
+
# Only permits assignments via defined accessors
|
29
|
+
def initialize(**attributes)
|
30
|
+
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
31
|
+
end
|
32
|
+
|
33
|
+
def nature
|
34
|
+
NATURE
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
data/lib/document.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module FormatParser
|
2
|
+
class Document
|
3
|
+
NATURE = :document
|
4
|
+
|
5
|
+
attr_accessor :format
|
6
|
+
attr_accessor :document_type
|
7
|
+
|
8
|
+
# Only permits assignments via defined accessors
|
9
|
+
def initialize(**attributes)
|
10
|
+
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
11
|
+
end
|
12
|
+
|
13
|
+
def nature
|
14
|
+
NATURE
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/format_parser.rb
CHANGED
@@ -1,12 +1,16 @@
|
|
1
1
|
require 'thread'
|
2
2
|
|
3
3
|
module FormatParser
|
4
|
-
require_relative '
|
4
|
+
require_relative 'image'
|
5
|
+
require_relative 'audio'
|
6
|
+
require_relative 'document'
|
7
|
+
require_relative 'video'
|
5
8
|
require_relative 'io_utils'
|
6
9
|
require_relative 'read_limiter'
|
7
10
|
require_relative 'remote_io'
|
8
11
|
require_relative 'io_constraint'
|
9
12
|
require_relative 'care'
|
13
|
+
require_relative 'parsers/dsl'
|
10
14
|
|
11
15
|
PARSER_MUX = Mutex.new
|
12
16
|
|
@@ -14,6 +18,13 @@ module FormatParser
|
|
14
18
|
PARSER_MUX.synchronize do
|
15
19
|
@parsers ||= []
|
16
20
|
@parsers << object_responding_to_new
|
21
|
+
# Gathering natures and formats from parsers. An instance has to be created.
|
22
|
+
parser = object_responding_to_new.new
|
23
|
+
@natures ||= Set.new
|
24
|
+
# NOTE: the merge method for sets modifies the instance.
|
25
|
+
@natures.merge(parser.natures)
|
26
|
+
@formats ||= Set.new
|
27
|
+
@formats.merge(parser.formats)
|
17
28
|
end
|
18
29
|
end
|
19
30
|
|
@@ -30,25 +41,32 @@ module FormatParser
|
|
30
41
|
parse(cached_io)
|
31
42
|
end
|
32
43
|
|
33
|
-
def self.parse(io)
|
44
|
+
def self.parse(io, natures: @natures.to_a, formats: @formats.to_a, returns: :all)
|
34
45
|
# If the cache is preconfigured do not apply an extra layer. It is going
|
35
46
|
# to be preconfigured when using parse_http.
|
36
47
|
io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
|
37
48
|
|
49
|
+
# How many results has the user asked for? Used to determine whether an array
|
50
|
+
# is returned or not.
|
51
|
+
amount = case returns
|
52
|
+
when :all
|
53
|
+
@parsers.count
|
54
|
+
when :one
|
55
|
+
1
|
56
|
+
else
|
57
|
+
throw ArgumentError.new(":returns does not match any supported mode (:all, :one)")
|
58
|
+
end
|
59
|
+
|
38
60
|
# Always instantiate parsers fresh for each input, since they might
|
39
61
|
# contain instance variables which otherwise would have to be reset
|
40
62
|
# between invocations, and would complicate threading situations
|
41
|
-
|
42
|
-
|
43
|
-
parsers.each do |parser|
|
63
|
+
results = parsers_for(natures, formats).map do |parser|
|
44
64
|
# We need to rewind for each parser, anew
|
45
65
|
io.seek(0)
|
46
66
|
# Limit how many operations the parser can perform
|
47
67
|
limited_io = ReadLimiter.new(io, max_bytes: 512*1024, max_reads: 64*1024, max_seeks: 64*1024)
|
48
68
|
begin
|
49
|
-
|
50
|
-
return info
|
51
|
-
end
|
69
|
+
parser.call(limited_io)
|
52
70
|
rescue IOUtils::InvalidRead
|
53
71
|
# There was not enough data for this parser to work on,
|
54
72
|
# and it triggered an error
|
@@ -57,8 +75,21 @@ module FormatParser
|
|
57
75
|
# caused the parser to go off-track. Strictly speaking we should log this
|
58
76
|
# and examine the file more closely.
|
59
77
|
end
|
60
|
-
end
|
61
|
-
|
78
|
+
end.reject(&:nil?).take(amount)
|
79
|
+
|
80
|
+
return results.first if amount == 1
|
81
|
+
# Convert the results from a lazy enumerator to an array.
|
82
|
+
results.to_a
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
def self.parsers_for(natures, formats)
|
88
|
+
# returns lazy enumerator for only computing the minimum amount of work (see :returns keyword argument)
|
89
|
+
@parsers.map(&:new).select do |parser|
|
90
|
+
# Does the parser support at least one nature and at least one format asked for by the user?
|
91
|
+
(natures & parser.natures).size > 0 && (formats & parser.formats).size > 0
|
92
|
+
end.lazy
|
62
93
|
end
|
63
94
|
|
64
95
|
Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
|
@@ -1,13 +1,11 @@
|
|
1
1
|
module FormatParser
|
2
|
-
class
|
3
|
-
|
4
|
-
# What kind of file is it?
|
5
|
-
attr_accessor :file_nature
|
2
|
+
class Image
|
3
|
+
NATURE = :image
|
6
4
|
|
7
5
|
# What filetype was recognized? Will contain a non-ambiguous symbol
|
8
6
|
# referring to the file format. The symbol can be used as a filename
|
9
7
|
# extension safely
|
10
|
-
attr_accessor :
|
8
|
+
attr_accessor :format
|
11
9
|
|
12
10
|
# Number of pixels horizontally in the pixel buffer
|
13
11
|
attr_accessor :width_px
|
@@ -42,25 +40,6 @@ module FormatParser
|
|
42
40
|
# http://magnushoff.com/jpeg-orientation.html
|
43
41
|
attr_accessor :image_orientation
|
44
42
|
|
45
|
-
# The number of audio channels for sound files that are muxed
|
46
|
-
# and for video files with embedded sound
|
47
|
-
attr_accessor :num_audio_channels
|
48
|
-
|
49
|
-
# SampeThe number of audio channels for sound files that are muxed
|
50
|
-
# and for video files with embedded sound
|
51
|
-
attr_accessor :audio_sample_rate_hz
|
52
|
-
|
53
|
-
# Duration of the media object (be it audio or video) in seconds,
|
54
|
-
# as a Float
|
55
|
-
attr_accessor :media_duration_seconds
|
56
|
-
|
57
|
-
# Duration of the media object in addressable frames or samples,
|
58
|
-
# as an Integer
|
59
|
-
attr_accessor :media_duration_frames
|
60
|
-
|
61
|
-
# XML Document Type
|
62
|
-
attr_accessor :document_type
|
63
|
-
|
64
43
|
# If a parser wants to provide any extra information to the caller
|
65
44
|
# it can be placed here
|
66
45
|
attr_accessor :intrinsics
|
@@ -70,8 +49,8 @@ module FormatParser
|
|
70
49
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
71
50
|
end
|
72
51
|
|
73
|
-
def
|
74
|
-
|
52
|
+
def nature
|
53
|
+
NATURE
|
75
54
|
end
|
76
55
|
end
|
77
56
|
end
|
data/lib/parsers/aiff_parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
class FormatParser::AIFFParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
+
include FormatParser::DSL
|
3
4
|
|
4
5
|
# Known chunk types we can omit when parsing,
|
5
6
|
# grossly lifted from http://www.muratnkonar.com/aiff/
|
@@ -18,7 +19,10 @@ class FormatParser::AIFFParser
|
|
18
19
|
'ANNO',
|
19
20
|
]
|
20
21
|
|
21
|
-
|
22
|
+
natures :audio
|
23
|
+
formats :aiff
|
24
|
+
|
25
|
+
def call(io)
|
22
26
|
io = FormatParser::IOConstraint.new(io)
|
23
27
|
form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
|
24
28
|
return unless form_chunk_type == "FORM" && chunk_size > 4
|
@@ -62,9 +66,8 @@ class FormatParser::AIFFParser
|
|
62
66
|
duration_in_seconds = sample_frames / sample_rate
|
63
67
|
return unless duration_in_seconds > 0
|
64
68
|
|
65
|
-
FormatParser::
|
66
|
-
|
67
|
-
file_type: :aiff,
|
69
|
+
FormatParser::Audio.new(
|
70
|
+
format: :aiff,
|
68
71
|
num_audio_channels: channels,
|
69
72
|
audio_sample_rate_hz: sample_rate.to_i,
|
70
73
|
media_duration_frames: sample_frames,
|
data/lib/parsers/dpx_parser.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
class FormatParser::DPXParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
+
include FormatParser::DSL
|
4
|
+
|
5
|
+
natures :image
|
6
|
+
formats :dpx
|
7
|
+
|
3
8
|
FILE_INFO = [
|
4
9
|
# :x4, # magic bytes SDPX, we read them anyway so not in the pattern
|
5
10
|
:x4, # u32 :image_offset, :desc => 'Offset to image data in bytes', :req => true
|
@@ -124,7 +129,7 @@ class FormatParser::DPXParser
|
|
124
129
|
LE_MAGIC = BE_MAGIC.reverse
|
125
130
|
HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
|
126
131
|
|
127
|
-
def
|
132
|
+
def call(io)
|
128
133
|
io = FormatParser::IOConstraint.new(io)
|
129
134
|
magic = io.read(4)
|
130
135
|
|
@@ -133,8 +138,8 @@ class FormatParser::DPXParser
|
|
133
138
|
unpack_pattern = DPX_INFO
|
134
139
|
unpack_pattern = DPX_INFO_LE if magic == LE_MAGIC
|
135
140
|
num_elements, pixels_per_line, num_lines, *_ = safe_read(io, HEADER_SIZE).unpack(unpack_pattern)
|
136
|
-
FormatParser::
|
137
|
-
|
141
|
+
FormatParser::Image.new(
|
142
|
+
format: :dpx,
|
138
143
|
width_px: pixels_per_line,
|
139
144
|
height_px: num_lines,
|
140
145
|
)
|
data/lib/parsers/dsl.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module FormatParser
|
2
|
+
# Small DSL to avoid repetitive code while defining new parsers. Also, it can be leveraged by
|
3
|
+
# third parties to define their own parsers.
|
4
|
+
module DSL
|
5
|
+
def self.included(base)
|
6
|
+
base.extend(ClassMethods)
|
7
|
+
end
|
8
|
+
|
9
|
+
module ClassMethods
|
10
|
+
def formats(*registred_formats)
|
11
|
+
__define(:formats, registred_formats)
|
12
|
+
end
|
13
|
+
|
14
|
+
def natures(*registred_natures)
|
15
|
+
__define(:natures, registred_natures)
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def __define(name, value)
|
21
|
+
throw ArgumentError("empty array") if value.empty?
|
22
|
+
throw ArgumentError("requires array of symbols") if value.any? { |s| !s.is_a?(Symbol) }
|
23
|
+
define_method(name) do
|
24
|
+
value
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/parsers/fdx_parser.rb
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
class FormatParser::FDXParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
+
include FormatParser::DSL
|
3
4
|
|
4
|
-
|
5
|
+
formats :fdx
|
6
|
+
natures :document
|
7
|
+
|
8
|
+
def call(io)
|
5
9
|
return if !xml_check(io)
|
6
10
|
file_and_document_type = safe_read(io, 100)
|
7
11
|
file_type, document_type = check_for_document_type(file_and_document_type)
|
8
12
|
return if file_type != :fdx
|
9
|
-
|
10
|
-
|
11
|
-
file_type: file_type,
|
13
|
+
FormatParser::Document.new(
|
14
|
+
format: file_type,
|
12
15
|
document_type: document_type
|
13
16
|
)
|
14
17
|
end
|
15
18
|
|
16
19
|
def xml_check(io)
|
17
20
|
xml_check = safe_read(io, 5)
|
18
|
-
xml_check == "<?xml"
|
21
|
+
xml_check == "<?xml"
|
19
22
|
end
|
20
|
-
|
23
|
+
|
21
24
|
def check_for_document_type(file_and_document_type)
|
22
25
|
sanitized_data = file_and_document_type.downcase
|
23
|
-
if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
|
26
|
+
if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
|
24
27
|
return :fdx, :script
|
25
28
|
else
|
26
29
|
return
|