format_parser 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 70fd9b84e2b397e862c2f6554eb6f3830f77b3c8
4
- data.tar.gz: 36c2a28a96f0bea5644f550bb616983d8320961d
3
+ metadata.gz: aac884390c22699e4c3508ef5ecd363d2f1d2b5a
4
+ data.tar.gz: da10c68c25b0ba46929282a1bcd918e9bf4513b1
5
5
  SHA512:
6
- metadata.gz: d2603890e62f99e5a4e63dd28f1860aeee86f1f47d0a690fa559f01a27aa106104fb71195427814ac346208435a3fd720db4458cdbd8cb9cf1291c517334ca8f
7
- data.tar.gz: 35f16d20d4a3e015d58b82297f66f2bd3df1e2ef78a0ced278c474bcec64e8d47241fc4e99368b1fa8d203bda182e509c0c58d65037539f9e7b6d5f7b98bfe0e
6
+ metadata.gz: 74efa5fe85532815e9475e711af8cfc6a2a4478c1832479ec073f7fdecdf4ed2997a61d2c5e068fe6013cd8fb9fbea7c16eaf1bc109485dde7d469b4cec62342
7
+ data.tar.gz: 2a8da42f8d4f49dff0b7c76393d5c9385ef109ada91d539235d5afbef98689e5f99ca4a5b9a99843c109d2665ea1c4843192d3cf9c25d4ccb3117c98c25fb455
data/README.md CHANGED
@@ -15,18 +15,57 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
15
15
 
16
16
  ## Basic usage
17
17
 
18
- Pass an IO object that responds to `read` and `seek` to `FormatParser`.
18
+ Pass an IO object that responds to `read` and `seek` to `FormatParser` and an array of matches will be returned.
19
19
 
20
20
  ```ruby
21
- file_info = FormatParser.parse(File.open("myimage.jpg", "rb"))
22
- file_info.file_nature #=> :image
23
- file_info.file_format #=> :JPG
24
- file_info.width_px #=> 320
25
- file_info.height_px #=> 240
26
- file_info.orientation #=> :top_left
21
+ matches = FormatParser.parse(File.open("myimage.jpg", "rb"))
22
+ matches.first.nature #=> :image
23
+ matches.first.format #=> :jpg
24
+ matches.first.width_px #=> 320
25
+ matches.first.height_px #=> 240
26
+ matches.first.orientation #=> :top_left
27
27
  ```
28
28
 
29
- If nothing is detected, the result will be `nil`.
29
+ If you would rather receive only one result, call the gem as follows:
30
+
31
+ ```ruby
32
+ FormatParser.parse(File.open("myimage.jpg", "rb"), returns: :one)
33
+ ```
34
+
35
+ You can also optimize the metadata extraction by providing hints to the gem:
36
+
37
+ ```ruby
38
+ FormatParser.parse(File.open("myimage", "rb"), natures: [:video, :image], formats: [:jpg, :png, :mp4])
39
+ ```
40
+
41
+ ## Creating your own parsers
42
+
43
+ In order to create new parsers, these have to meet three requirements:
44
+
45
+ 1) Instances of the new parser class need to respond to a `call` method which takes one IO object as an argument and returns some metadata information about its corresponding file, or nil otherwise.
46
+ 2) Instances of the new parser class need to respond to the `natures` and `formats` accessor methods, both returning an array of symbols. A simple DSL is provided to avoid writing those accessors.
47
+ 3) The class needs to register itself as a parser.
48
+
49
+
50
+ Down below you can find a basic parser implementation:
51
+
52
+ ```ruby
53
+ class BasicParser
54
+ include FormatParser::DSL # Adds formats and natures methods to the class, which define
55
+ # accessor for all the instances.
56
+
57
+ formats :foo, :baz # Indicates which formats it can read.
58
+ natures :bar # Indicates which type of file from a human perspective it can read:
59
+ # - :audio
60
+ # - :document
61
+ # - :image
62
+ # - :video
63
+ def call(file)
64
+ # Returns a DTO object including some metadata.
65
+ end
66
+
67
+ FormatParser.register_parser_constructor self # Register this parser.
68
+ ```
30
69
 
31
70
  ## Design rationale
32
71
 
@@ -15,8 +15,14 @@ Gem::Specification.new do |spec|
15
15
  minimum amount of data possible."
16
16
  spec.homepage = "https://github.com/WeTransfer/format_parser"
17
17
  spec.license = "MIT"
18
-
19
- # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # Alert people to a change in the gem's interface, will remove in a subsequent version
19
+ spec.post_install_message = %q{
20
+ -----------------------------------------------------------------------------
21
+ | ATTENTION: format_parser v0.2.0 introduces changes to the gem's interface.|
22
+ | See https://github.com/WeTransfer/format_parser#basic-usage |
23
+ | for up-to-date usage instructions. Thank you for using format_parser! :) |
24
+ -----------------------------------------------------------------------------
25
+ }
20
26
  # to allow pushing to a single host or delete this section to allow pushing to any host.
21
27
  if spec.respond_to?(:metadata)
22
28
  spec.metadata['allowed_push_host'] = "https://rubygems.org"
@@ -35,7 +41,7 @@ Gem::Specification.new do |spec|
35
41
  spec.add_dependency 'ks', '~> 0.0.1'
36
42
  spec.add_dependency 'exifr', '~> 1.0'
37
43
  spec.add_dependency 'faraday', '~> 0.13'
38
-
44
+
39
45
  spec.add_development_dependency 'rspec', '~> 3.0'
40
46
  spec.add_development_dependency 'rake', '~> 12'
41
47
  spec.add_development_dependency 'simplecov', '~> 0.15'
data/lib/audio.rb ADDED
@@ -0,0 +1,37 @@
1
+ module FormatParser
2
+ class Audio
3
+ NATURE = :audio
4
+
5
+ # Type of the file (e.g :mp3)
6
+ attr_accessor :format
7
+
8
+ # The number of audio channels for sound files that are muxed
9
+ # and for video files with embedded sound
10
+ attr_accessor :num_audio_channels
11
+
12
+ # The audio sample rate, in Hz, for sound files that are muxed
13
+ # and for video files with embedded sound
14
+ attr_accessor :audio_sample_rate_hz
15
+
16
+ # Duration of the media object (be it audio or video) in seconds,
17
+ # as a Float
18
+ attr_accessor :media_duration_seconds
19
+
20
+ # Duration of the media object in addressable frames or samples,
21
+ # as an Integer
22
+ attr_accessor :media_duration_frames
23
+
24
+ # If a parser wants to provide any extra information to the caller
25
+ # it can be placed here
26
+ attr_accessor :intrinsics
27
+
28
+ # Only permits assignments via defined accessors
29
+ def initialize(**attributes)
30
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
31
+ end
32
+
33
+ def nature
34
+ NATURE
35
+ end
36
+ end
37
+ end
data/lib/document.rb ADDED
@@ -0,0 +1,17 @@
1
+ module FormatParser
2
+ class Document
3
+ NATURE = :document
4
+
5
+ attr_accessor :format
6
+ attr_accessor :document_type
7
+
8
+ # Only permits assignments via defined accessors
9
+ def initialize(**attributes)
10
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
11
+ end
12
+
13
+ def nature
14
+ NATURE
15
+ end
16
+ end
17
+ end
data/lib/format_parser.rb CHANGED
@@ -1,12 +1,16 @@
1
1
  require 'thread'
2
2
 
3
3
  module FormatParser
4
- require_relative 'file_information'
4
+ require_relative 'image'
5
+ require_relative 'audio'
6
+ require_relative 'document'
7
+ require_relative 'video'
5
8
  require_relative 'io_utils'
6
9
  require_relative 'read_limiter'
7
10
  require_relative 'remote_io'
8
11
  require_relative 'io_constraint'
9
12
  require_relative 'care'
13
+ require_relative 'parsers/dsl'
10
14
 
11
15
  PARSER_MUX = Mutex.new
12
16
 
@@ -14,6 +18,13 @@ module FormatParser
14
18
  PARSER_MUX.synchronize do
15
19
  @parsers ||= []
16
20
  @parsers << object_responding_to_new
21
+ # Gathering natures and formats from parsers. An instance has to be created.
22
+ parser = object_responding_to_new.new
23
+ @natures ||= Set.new
24
+ # NOTE: the merge method for sets modifies the instance.
25
+ @natures.merge(parser.natures)
26
+ @formats ||= Set.new
27
+ @formats.merge(parser.formats)
17
28
  end
18
29
  end
19
30
 
@@ -30,25 +41,32 @@ module FormatParser
30
41
  parse(cached_io)
31
42
  end
32
43
 
33
- def self.parse(io)
44
+ def self.parse(io, natures: @natures.to_a, formats: @formats.to_a, returns: :all)
34
45
  # If the cache is preconfigured do not apply an extra layer. It is going
35
46
  # to be preconfigured when using parse_http.
36
47
  io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
37
48
 
49
+ # How many results has the user asked for? Used to determine whether an array
50
+ # is returned or not.
51
+ amount = case returns
52
+ when :all
53
+ @parsers.count
54
+ when :one
55
+ 1
56
+ else
57
+ throw ArgumentError.new(":returns does not match any supported mode (:all, :one)")
58
+ end
59
+
38
60
  # Always instantiate parsers fresh for each input, since they might
39
61
  # contain instance variables which otherwise would have to be reset
40
62
  # between invocations, and would complicate threading situations
41
- parsers = @parsers.map(&:new)
42
-
43
- parsers.each do |parser|
63
+ results = parsers_for(natures, formats).map do |parser|
44
64
  # We need to rewind for each parser, anew
45
65
  io.seek(0)
46
66
  # Limit how many operations the parser can perform
47
67
  limited_io = ReadLimiter.new(io, max_bytes: 512*1024, max_reads: 64*1024, max_seeks: 64*1024)
48
68
  begin
49
- if info = parser.information_from_io(limited_io)
50
- return info
51
- end
69
+ parser.call(limited_io)
52
70
  rescue IOUtils::InvalidRead
53
71
  # There was not enough data for this parser to work on,
54
72
  # and it triggered an error
@@ -57,8 +75,21 @@ module FormatParser
57
75
  # caused the parser to go off-track. Strictly speaking we should log this
58
76
  # and examine the file more closely.
59
77
  end
60
- end
61
- nil # Nothing matched
78
+ end.reject(&:nil?).take(amount)
79
+
80
+ return results.first if amount == 1
81
+ # Convert the results from a lazy enumerator to an array.
82
+ results.to_a
83
+ end
84
+
85
+ private
86
+
87
+ def self.parsers_for(natures, formats)
88
+ # returns lazy enumerator for only computing the minimum amount of work (see :returns keyword argument)
89
+ @parsers.map(&:new).select do |parser|
90
+ # Does a given parser handle at least one nature and at least one format asked for by the user?
91
+ (natures & parser.natures).size > 0 && (formats & parser.formats).size > 0
92
+ end.lazy
62
93
  end
63
94
 
64
95
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.1.7'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -1,13 +1,11 @@
1
1
  module FormatParser
2
- class FileInformation
3
-
4
- # What kind of file is it?
5
- attr_accessor :file_nature
2
+ class Image
3
+ NATURE = :image
6
4
 
7
5
  # What filetype was recognized? Will contain a non-ambiguous symbol
8
6
  # referring to the file format. The symbol can be used as a filename
9
7
  # extension safely
10
- attr_accessor :file_type
8
+ attr_accessor :format
11
9
 
12
10
  # Number of pixels horizontally in the pixel buffer
13
11
  attr_accessor :width_px
@@ -42,25 +40,6 @@ module FormatParser
42
40
  # http://magnushoff.com/jpeg-orientation.html
43
41
  attr_accessor :image_orientation
44
42
 
45
- # The number of audio channels for sound files that are muxed
46
- # and for video files with embedded sound
47
- attr_accessor :num_audio_channels
48
-
49
- # SampeThe number of audio channels for sound files that are muxed
50
- # and for video files with embedded sound
51
- attr_accessor :audio_sample_rate_hz
52
-
53
- # Duration of the media object (be it audio or video) in seconds,
54
- # as a Float
55
- attr_accessor :media_duration_seconds
56
-
57
- # Duration of the media object in addressable frames or samples,
58
- # as an Integer
59
- attr_accessor :media_duration_frames
60
-
61
- # XML Document Type
62
- attr_accessor :document_type
63
-
64
43
  # If a parser wants to provide any extra information to the caller
65
44
  # it can be placed here
66
45
  attr_accessor :intrinsics
@@ -70,8 +49,8 @@ module FormatParser
70
49
  attributes.map { |(k, v)| public_send("#{k}=", v) }
71
50
  end
72
51
 
73
- def self.image(**kwargs)
74
- new(file_nature: :image, **kwargs)
52
+ def nature
53
+ NATURE
75
54
  end
76
55
  end
77
56
  end
@@ -1,5 +1,6 @@
1
1
  class FormatParser::AIFFParser
2
2
  include FormatParser::IOUtils
3
+ include FormatParser::DSL
3
4
 
4
5
  # Known chunk types we can omit when parsing,
5
6
  # grossly lifted from http://www.muratnkonar.com/aiff/
@@ -18,7 +19,10 @@ class FormatParser::AIFFParser
18
19
  'ANNO',
19
20
  ]
20
21
 
21
- def information_from_io(io)
22
+ natures :audio
23
+ formats :aiff
24
+
25
+ def call(io)
22
26
  io = FormatParser::IOConstraint.new(io)
23
27
  form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
24
28
  return unless form_chunk_type == "FORM" && chunk_size > 4
@@ -62,9 +66,8 @@ class FormatParser::AIFFParser
62
66
  duration_in_seconds = sample_frames / sample_rate
63
67
  return unless duration_in_seconds > 0
64
68
 
65
- FormatParser::FileInformation.new(
66
- file_nature: :audio,
67
- file_type: :aiff,
69
+ FormatParser::Audio.new(
70
+ format: :aiff,
68
71
  num_audio_channels: channels,
69
72
  audio_sample_rate_hz: sample_rate.to_i,
70
73
  media_duration_frames: sample_frames,
@@ -1,5 +1,10 @@
1
1
  class FormatParser::DPXParser
2
2
  include FormatParser::IOUtils
3
+ include FormatParser::DSL
4
+
5
+ natures :image
6
+ formats :dpx
7
+
3
8
  FILE_INFO = [
4
9
  # :x4, # magic bytes SDPX, we read them anyway so not in the pattern
5
10
  :x4, # u32 :image_offset, :desc => 'Offset to image data in bytes', :req => true
@@ -124,7 +129,7 @@ class FormatParser::DPXParser
124
129
  LE_MAGIC = BE_MAGIC.reverse
125
130
  HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
126
131
 
127
- def information_from_io(io)
132
+ def call(io)
128
133
  io = FormatParser::IOConstraint.new(io)
129
134
  magic = io.read(4)
130
135
 
@@ -133,8 +138,8 @@ class FormatParser::DPXParser
133
138
  unpack_pattern = DPX_INFO
134
139
  unpack_pattern = DPX_INFO_LE if magic == LE_MAGIC
135
140
  num_elements, pixels_per_line, num_lines, *_ = safe_read(io, HEADER_SIZE).unpack(unpack_pattern)
136
- FormatParser::FileInformation.image(
137
- file_type: :dpx,
141
+ FormatParser::Image.new(
142
+ format: :dpx,
138
143
  width_px: pixels_per_line,
139
144
  height_px: num_lines,
140
145
  )
@@ -0,0 +1,29 @@
1
+ module FormatParser
2
+ # Small DSL to avoid repetitive code while defining new parsers. Also, it can be leveraged by
3
+ # third parties to define their own parsers.
4
+ module DSL
5
+ def self.included(base)
6
+ base.extend(ClassMethods)
7
+ end
8
+
9
+ module ClassMethods
10
+ def formats(*registred_formats)
11
+ __define(:formats, registred_formats)
12
+ end
13
+
14
+ def natures(*registred_natures)
15
+ __define(:natures, registred_natures)
16
+ end
17
+
18
+ private
19
+
20
+ def __define(name, value)
21
+ throw ArgumentError("empty array") if value.empty?
22
+ throw ArgumentError("requires array of symbols") if value.any? { |s| !s.is_a?(Symbol) }
23
+ define_method(name) do
24
+ value
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -1,26 +1,29 @@
1
1
  class FormatParser::FDXParser
2
2
  include FormatParser::IOUtils
3
+ include FormatParser::DSL
3
4
 
4
- def information_from_io(io)
5
+ formats :fdx
6
+ natures :document
7
+
8
+ def call(io)
5
9
  return if !xml_check(io)
6
10
  file_and_document_type = safe_read(io, 100)
7
11
  file_type, document_type = check_for_document_type(file_and_document_type)
8
12
  return if file_type != :fdx
9
- file_info = FormatParser::FileInformation.new(
10
- file_nature: :document,
11
- file_type: file_type,
13
+ FormatParser::Document.new(
14
+ format: file_type,
12
15
  document_type: document_type
13
16
  )
14
17
  end
15
18
 
16
19
  def xml_check(io)
17
20
  xml_check = safe_read(io, 5)
18
- xml_check == "<?xml" ? true : false
21
+ xml_check == "<?xml"
19
22
  end
20
-
23
+
21
24
  def check_for_document_type(file_and_document_type)
22
25
  sanitized_data = file_and_document_type.downcase
23
- if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
26
+ if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
24
27
  return :fdx, :script
25
28
  else
26
29
  return