format_parser 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +47 -8
- data/format_parser.gemspec +9 -3
- data/lib/audio.rb +37 -0
- data/lib/document.rb +17 -0
- data/lib/format_parser.rb +41 -10
- data/lib/format_parser/version.rb +1 -1
- data/lib/{file_information.rb → image.rb} +5 -26
- data/lib/parsers/aiff_parser.rb +7 -4
- data/lib/parsers/dpx_parser.rb +8 -3
- data/lib/parsers/dsl.rb +29 -0
- data/lib/parsers/fdx_parser.rb +10 -7
- data/lib/parsers/gif_parser.rb +8 -5
- data/lib/parsers/jpeg_parser.rb +8 -12
- data/lib/parsers/moov_parser.rb +9 -7
- data/lib/parsers/mp3_parser.rb +7 -4
- data/lib/parsers/png_parser.rb +9 -5
- data/lib/parsers/psd_parser.rb +8 -5
- data/lib/parsers/tiff_parser.rb +9 -13
- data/lib/parsers/wav_parser.rb +6 -5
- data/lib/video.rb +33 -0
- data/spec/aiff_parser_spec.rb +6 -6
- data/spec/file_information_spec.rb +4 -4
- data/spec/format_parser_spec.rb +30 -2
- data/spec/parsers/dpx_parser_spec.rb +4 -4
- data/spec/parsers/fdx_parser_spec.rb +5 -5
- data/spec/parsers/gif_parser_spec.rb +4 -4
- data/spec/parsers/jpeg_parser_spec.rb +4 -4
- data/spec/parsers/moov_parser_spec.rb +9 -8
- data/spec/parsers/mp3_parser_spec.rb +9 -9
- data/spec/parsers/png_parser_spec.rb +4 -4
- data/spec/parsers/psd_parser_spec.rb +3 -3
- data/spec/parsers/tiff_parser_spec.rb +4 -4
- data/spec/parsers/wav_parser_spec.rb +13 -13
- data/spec/remote_fetching_spec.rb +9 -9
- metadata +14 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aac884390c22699e4c3508ef5ecd363d2f1d2b5a
|
4
|
+
data.tar.gz: da10c68c25b0ba46929282a1bcd918e9bf4513b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74efa5fe85532815e9475e711af8cfc6a2a4478c1832479ec073f7fdecdf4ed2997a61d2c5e068fe6013cd8fb9fbea7c16eaf1bc109485dde7d469b4cec62342
|
7
|
+
data.tar.gz: 2a8da42f8d4f49dff0b7c76393d5c9385ef109ada91d539235d5afbef98689e5f99ca4a5b9a99843c109d2665ea1c4843192d3cf9c25d4ccb3117c98c25fb455
|
data/README.md
CHANGED
@@ -15,18 +15,57 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
|
|
15
15
|
|
16
16
|
## Basic usage
|
17
17
|
|
18
|
-
Pass an IO object that responds to `read` and `seek` to `FormatParser
|
18
|
+
Pass an IO object that responds to `read` and `seek` to `FormatParser` and an array of matches will be returned.
|
19
19
|
|
20
20
|
```ruby
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
matches = FormatParser.parse(File.open("myimage.jpg", "rb"))
|
22
|
+
matches.first.nature #=> :image
|
23
|
+
matches.first.format #=> :jpg
|
24
|
+
matches.first.width_px #=> 320
|
25
|
+
matches.first.height_px #=> 240
|
26
|
+
matches.first.orientation #=> :top_left
|
27
27
|
```
|
28
28
|
|
29
|
-
If
|
29
|
+
If you would rather receive only one result, call the gem as follows:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
FormatParser.parse(File.open("myimage.jpg", "rb"), returns: :one)
|
33
|
+
```
|
34
|
+
|
35
|
+
You can also optimize the metadata extraction by providing hints to the gem:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
FormatParser.parse(File.open("myimage", "rb"), natures: [:video, :image], formats: [:jpg, :png, :mp4])
|
39
|
+
```
|
40
|
+
|
41
|
+
## Creating your own parsers
|
42
|
+
|
43
|
+
In order to create new parsers, these have to meet three requirements:
|
44
|
+
|
45
|
+
1) Instances of the new parser class need to respond to a `call` method which takes one IO object as an argument and returns some metadata information about its corresponding file, or nil otherwise.
|
46
|
+
2) Instances of the new parser class need to respond to `natures` and `formats` accessor methods, both returning an array of symbols. A simple DSL is provided to avoid writing those accessors.
|
47
|
+
3) The class needs to register itself as a parser.
|
48
|
+
|
49
|
+
|
50
|
+
Down below you can find a basic parser implementation:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
class BasicParser
|
54
|
+
include FormatParser::DSL # Adds formats and natures methods to the class, which define
|
55
|
+
# accessor for all the instances.
|
56
|
+
|
57
|
+
formats :foo, :baz # Indicates which formats it can read.
|
58
|
+
natures :bar # Indicates which type of file from a human perspective it can read:
|
59
|
+
# - :audio
|
60
|
+
# - :document
|
61
|
+
# - :image
|
62
|
+
# - :video
|
63
|
+
def call(file)
|
64
|
+
# Returns a DTO object including some metadata.
|
65
|
+
end
|
66
|
+
|
67
|
+
FormatParser.register_parser_constructor self # Register this parser.
|
68
|
+
```
|
30
69
|
|
31
70
|
## Design rationale
|
32
71
|
|
data/format_parser.gemspec
CHANGED
@@ -15,8 +15,14 @@ Gem::Specification.new do |spec|
|
|
15
15
|
minimum amount of data possible."
|
16
16
|
spec.homepage = "https://github.com/WeTransfer/format_parser"
|
17
17
|
spec.license = "MIT"
|
18
|
-
|
19
|
-
|
18
|
+
# Alert people to a change in the gem's interface, will remove in a subsequent version
|
19
|
+
spec.post_install_message = %q{
|
20
|
+
-----------------------------------------------------------------------------
|
21
|
+
| ATTENTION: format_parser v0.2.0 introduces changes to the gem's interface.|
|
22
|
+
| See https://github.com/WeTransfer/format_parser#basic-usage |
|
23
|
+
| for up-to-date usage instructions. Thank you for using format_parser! :) |
|
24
|
+
-----------------------------------------------------------------------------
|
25
|
+
}
|
20
26
|
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
21
27
|
if spec.respond_to?(:metadata)
|
22
28
|
spec.metadata['allowed_push_host'] = "https://rubygems.org"
|
@@ -35,7 +41,7 @@ Gem::Specification.new do |spec|
|
|
35
41
|
spec.add_dependency 'ks', '~> 0.0.1'
|
36
42
|
spec.add_dependency 'exifr', '~> 1.0'
|
37
43
|
spec.add_dependency 'faraday', '~> 0.13'
|
38
|
-
|
44
|
+
|
39
45
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
40
46
|
spec.add_development_dependency 'rake', '~> 12'
|
41
47
|
spec.add_development_dependency 'simplecov', '~> 0.15'
|
data/lib/audio.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module FormatParser
|
2
|
+
class Audio
|
3
|
+
NATURE = :audio
|
4
|
+
|
5
|
+
# Type of the file (e.g :mp3)
|
6
|
+
attr_accessor :format
|
7
|
+
|
8
|
+
# The number of audio channels for sound files that are muxed
|
9
|
+
# and for video files with embedded sound
|
10
|
+
attr_accessor :num_audio_channels
|
11
|
+
|
12
|
+
# The audio sample rate in Hz for sound files that are muxed
|
13
|
+
# and for video files with embedded sound
|
14
|
+
attr_accessor :audio_sample_rate_hz
|
15
|
+
|
16
|
+
# Duration of the media object (be it audio or video) in seconds,
|
17
|
+
# as a Float
|
18
|
+
attr_accessor :media_duration_seconds
|
19
|
+
|
20
|
+
# Duration of the media object in addressable frames or samples,
|
21
|
+
# as an Integer
|
22
|
+
attr_accessor :media_duration_frames
|
23
|
+
|
24
|
+
# If a parser wants to provide any extra information to the caller
|
25
|
+
# it can be placed here
|
26
|
+
attr_accessor :intrinsics
|
27
|
+
|
28
|
+
# Only permits assignments via defined accessors
|
29
|
+
def initialize(**attributes)
|
30
|
+
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
31
|
+
end
|
32
|
+
|
33
|
+
def nature
|
34
|
+
NATURE
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
data/lib/document.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module FormatParser
|
2
|
+
class Document
|
3
|
+
NATURE = :document
|
4
|
+
|
5
|
+
attr_accessor :format
|
6
|
+
attr_accessor :document_type
|
7
|
+
|
8
|
+
# Only permits assignments via defined accessors
|
9
|
+
def initialize(**attributes)
|
10
|
+
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
11
|
+
end
|
12
|
+
|
13
|
+
def nature
|
14
|
+
NATURE
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/format_parser.rb
CHANGED
@@ -1,12 +1,16 @@
|
|
1
1
|
require 'thread'
|
2
2
|
|
3
3
|
module FormatParser
|
4
|
-
require_relative '
|
4
|
+
require_relative 'image'
|
5
|
+
require_relative 'audio'
|
6
|
+
require_relative 'document'
|
7
|
+
require_relative 'video'
|
5
8
|
require_relative 'io_utils'
|
6
9
|
require_relative 'read_limiter'
|
7
10
|
require_relative 'remote_io'
|
8
11
|
require_relative 'io_constraint'
|
9
12
|
require_relative 'care'
|
13
|
+
require_relative 'parsers/dsl'
|
10
14
|
|
11
15
|
PARSER_MUX = Mutex.new
|
12
16
|
|
@@ -14,6 +18,13 @@ module FormatParser
|
|
14
18
|
PARSER_MUX.synchronize do
|
15
19
|
@parsers ||= []
|
16
20
|
@parsers << object_responding_to_new
|
21
|
+
# Gathering natures and formats from parsers. An instance has to be created.
|
22
|
+
parser = object_responding_to_new.new
|
23
|
+
@natures ||= Set.new
|
24
|
+
# NOTE: the merge method for sets modifies the instance.
|
25
|
+
@natures.merge(parser.natures)
|
26
|
+
@formats ||= Set.new
|
27
|
+
@formats.merge(parser.formats)
|
17
28
|
end
|
18
29
|
end
|
19
30
|
|
@@ -30,25 +41,32 @@ module FormatParser
|
|
30
41
|
parse(cached_io)
|
31
42
|
end
|
32
43
|
|
33
|
-
def self.parse(io)
|
44
|
+
def self.parse(io, natures: @natures.to_a, formats: @formats.to_a, returns: :all)
|
34
45
|
# If the cache is preconfigured do not apply an extra layer. It is going
|
35
46
|
# to be preconfigured when using parse_http.
|
36
47
|
io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
|
37
48
|
|
49
|
+
# How many results has the user asked for? Used to determine whether an array
|
50
|
+
# is returned or not.
|
51
|
+
amount = case returns
|
52
|
+
when :all
|
53
|
+
@parsers.count
|
54
|
+
when :one
|
55
|
+
1
|
56
|
+
else
|
57
|
+
throw ArgumentError.new(":returns does not match any supported mode (:all, :one)")
|
58
|
+
end
|
59
|
+
|
38
60
|
# Always instantiate parsers fresh for each input, since they might
|
39
61
|
# contain instance variables which otherwise would have to be reset
|
40
62
|
# between invocations, and would complicate threading situations
|
41
|
-
|
42
|
-
|
43
|
-
parsers.each do |parser|
|
63
|
+
results = parsers_for(natures, formats).map do |parser|
|
44
64
|
# We need to rewind for each parser, anew
|
45
65
|
io.seek(0)
|
46
66
|
# Limit how many operations the parser can perform
|
47
67
|
limited_io = ReadLimiter.new(io, max_bytes: 512*1024, max_reads: 64*1024, max_seeks: 64*1024)
|
48
68
|
begin
|
49
|
-
|
50
|
-
return info
|
51
|
-
end
|
69
|
+
parser.call(limited_io)
|
52
70
|
rescue IOUtils::InvalidRead
|
53
71
|
# There was not enough data for this parser to work on,
|
54
72
|
# and it triggered an error
|
@@ -57,8 +75,21 @@ module FormatParser
|
|
57
75
|
# caused the parser to go off-track. Strictly speaking we should log this
|
58
76
|
# and examine the file more closely.
|
59
77
|
end
|
60
|
-
end
|
61
|
-
|
78
|
+
end.reject(&:nil?).take(amount)
|
79
|
+
|
80
|
+
return results.first if amount == 1
|
81
|
+
# Convert the results from a lazy enumerator to an array.
|
82
|
+
results.to_a
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
def self.parsers_for(natures, formats)
|
88
|
+
# returns lazy enumerator for only computing the minimum amount of work (see :returns keyword argument)
|
89
|
+
@parsers.map(&:new).select do |parser|
|
90
|
+
# Does a given parser contain any nature and/or format asked by the user?
|
91
|
+
(natures & parser.natures).size > 0 && (formats & parser.formats).size > 0
|
92
|
+
end.lazy
|
62
93
|
end
|
63
94
|
|
64
95
|
Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
|
@@ -1,13 +1,11 @@
|
|
1
1
|
module FormatParser
|
2
|
-
class
|
3
|
-
|
4
|
-
# What kind of file is it?
|
5
|
-
attr_accessor :file_nature
|
2
|
+
class Image
|
3
|
+
NATURE = :image
|
6
4
|
|
7
5
|
# What filetype was recognized? Will contain a non-ambiguous symbol
|
8
6
|
# referring to the file format. The symbol can be used as a filename
|
9
7
|
# extension safely
|
10
|
-
attr_accessor :
|
8
|
+
attr_accessor :format
|
11
9
|
|
12
10
|
# Number of pixels horizontally in the pixel buffer
|
13
11
|
attr_accessor :width_px
|
@@ -42,25 +40,6 @@ module FormatParser
|
|
42
40
|
# http://magnushoff.com/jpeg-orientation.html
|
43
41
|
attr_accessor :image_orientation
|
44
42
|
|
45
|
-
# The number of audio channels for sound files that are muxed
|
46
|
-
# and for video files with embedded sound
|
47
|
-
attr_accessor :num_audio_channels
|
48
|
-
|
49
|
-
# SampeThe number of audio channels for sound files that are muxed
|
50
|
-
# and for video files with embedded sound
|
51
|
-
attr_accessor :audio_sample_rate_hz
|
52
|
-
|
53
|
-
# Duration of the media object (be it audio or video) in seconds,
|
54
|
-
# as a Float
|
55
|
-
attr_accessor :media_duration_seconds
|
56
|
-
|
57
|
-
# Duration of the media object in addressable frames or samples,
|
58
|
-
# as an Integer
|
59
|
-
attr_accessor :media_duration_frames
|
60
|
-
|
61
|
-
# XML Document Type
|
62
|
-
attr_accessor :document_type
|
63
|
-
|
64
43
|
# If a parser wants to provide any extra information to the caller
|
65
44
|
# it can be placed here
|
66
45
|
attr_accessor :intrinsics
|
@@ -70,8 +49,8 @@ module FormatParser
|
|
70
49
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
71
50
|
end
|
72
51
|
|
73
|
-
def
|
74
|
-
|
52
|
+
def nature
|
53
|
+
NATURE
|
75
54
|
end
|
76
55
|
end
|
77
56
|
end
|
data/lib/parsers/aiff_parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
class FormatParser::AIFFParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
+
include FormatParser::DSL
|
3
4
|
|
4
5
|
# Known chunk types we can omit when parsing,
|
5
6
|
# grossly lifted from http://www.muratnkonar.com/aiff/
|
@@ -18,7 +19,10 @@ class FormatParser::AIFFParser
|
|
18
19
|
'ANNO',
|
19
20
|
]
|
20
21
|
|
21
|
-
|
22
|
+
natures :audio
|
23
|
+
formats :aiff
|
24
|
+
|
25
|
+
def call(io)
|
22
26
|
io = FormatParser::IOConstraint.new(io)
|
23
27
|
form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
|
24
28
|
return unless form_chunk_type == "FORM" && chunk_size > 4
|
@@ -62,9 +66,8 @@ class FormatParser::AIFFParser
|
|
62
66
|
duration_in_seconds = sample_frames / sample_rate
|
63
67
|
return unless duration_in_seconds > 0
|
64
68
|
|
65
|
-
FormatParser::
|
66
|
-
|
67
|
-
file_type: :aiff,
|
69
|
+
FormatParser::Audio.new(
|
70
|
+
format: :aiff,
|
68
71
|
num_audio_channels: channels,
|
69
72
|
audio_sample_rate_hz: sample_rate.to_i,
|
70
73
|
media_duration_frames: sample_frames,
|
data/lib/parsers/dpx_parser.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
class FormatParser::DPXParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
+
include FormatParser::DSL
|
4
|
+
|
5
|
+
natures :image
|
6
|
+
formats :dpx
|
7
|
+
|
3
8
|
FILE_INFO = [
|
4
9
|
# :x4, # magic bytes SDPX, we read them anyway so not in the pattern
|
5
10
|
:x4, # u32 :image_offset, :desc => 'Offset to image data in bytes', :req => true
|
@@ -124,7 +129,7 @@ class FormatParser::DPXParser
|
|
124
129
|
LE_MAGIC = BE_MAGIC.reverse
|
125
130
|
HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
|
126
131
|
|
127
|
-
def
|
132
|
+
def call(io)
|
128
133
|
io = FormatParser::IOConstraint.new(io)
|
129
134
|
magic = io.read(4)
|
130
135
|
|
@@ -133,8 +138,8 @@ class FormatParser::DPXParser
|
|
133
138
|
unpack_pattern = DPX_INFO
|
134
139
|
unpack_pattern = DPX_INFO_LE if magic == LE_MAGIC
|
135
140
|
num_elements, pixels_per_line, num_lines, *_ = safe_read(io, HEADER_SIZE).unpack(unpack_pattern)
|
136
|
-
FormatParser::
|
137
|
-
|
141
|
+
FormatParser::Image.new(
|
142
|
+
format: :dpx,
|
138
143
|
width_px: pixels_per_line,
|
139
144
|
height_px: num_lines,
|
140
145
|
)
|
data/lib/parsers/dsl.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module FormatParser
|
2
|
+
# Small DSL to avoid repetitive code while defining new parsers. Also, it can be leveraged by
|
3
|
+
# third parties to define their own parsers.
|
4
|
+
module DSL
|
5
|
+
def self.included(base)
|
6
|
+
base.extend(ClassMethods)
|
7
|
+
end
|
8
|
+
|
9
|
+
module ClassMethods
|
10
|
+
def formats(*registred_formats)
|
11
|
+
__define(:formats, registred_formats)
|
12
|
+
end
|
13
|
+
|
14
|
+
def natures(*registred_natures)
|
15
|
+
__define(:natures, registred_natures)
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def __define(name, value)
|
21
|
+
throw ArgumentError("empty array") if value.empty?
|
22
|
+
throw ArgumentError("requires array of symbols") if value.any? { |s| !s.is_a?(Symbol) }
|
23
|
+
define_method(name) do
|
24
|
+
value
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/parsers/fdx_parser.rb
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
class FormatParser::FDXParser
|
2
2
|
include FormatParser::IOUtils
|
3
|
+
include FormatParser::DSL
|
3
4
|
|
4
|
-
|
5
|
+
formats :fdx
|
6
|
+
natures :document
|
7
|
+
|
8
|
+
def call(io)
|
5
9
|
return if !xml_check(io)
|
6
10
|
file_and_document_type = safe_read(io, 100)
|
7
11
|
file_type, document_type = check_for_document_type(file_and_document_type)
|
8
12
|
return if file_type != :fdx
|
9
|
-
|
10
|
-
|
11
|
-
file_type: file_type,
|
13
|
+
FormatParser::Document.new(
|
14
|
+
format: file_type,
|
12
15
|
document_type: document_type
|
13
16
|
)
|
14
17
|
end
|
15
18
|
|
16
19
|
def xml_check(io)
|
17
20
|
xml_check = safe_read(io, 5)
|
18
|
-
xml_check == "<?xml"
|
21
|
+
xml_check == "<?xml"
|
19
22
|
end
|
20
|
-
|
23
|
+
|
21
24
|
def check_for_document_type(file_and_document_type)
|
22
25
|
sanitized_data = file_and_document_type.downcase
|
23
|
-
if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
|
26
|
+
if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
|
24
27
|
return :fdx, :script
|
25
28
|
else
|
26
29
|
return
|