format_parser 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 70fd9b84e2b397e862c2f6554eb6f3830f77b3c8
4
- data.tar.gz: 36c2a28a96f0bea5644f550bb616983d8320961d
3
+ metadata.gz: aac884390c22699e4c3508ef5ecd363d2f1d2b5a
4
+ data.tar.gz: da10c68c25b0ba46929282a1bcd918e9bf4513b1
5
5
  SHA512:
6
- metadata.gz: d2603890e62f99e5a4e63dd28f1860aeee86f1f47d0a690fa559f01a27aa106104fb71195427814ac346208435a3fd720db4458cdbd8cb9cf1291c517334ca8f
7
- data.tar.gz: 35f16d20d4a3e015d58b82297f66f2bd3df1e2ef78a0ced278c474bcec64e8d47241fc4e99368b1fa8d203bda182e509c0c58d65037539f9e7b6d5f7b98bfe0e
6
+ metadata.gz: 74efa5fe85532815e9475e711af8cfc6a2a4478c1832479ec073f7fdecdf4ed2997a61d2c5e068fe6013cd8fb9fbea7c16eaf1bc109485dde7d469b4cec62342
7
+ data.tar.gz: 2a8da42f8d4f49dff0b7c76393d5c9385ef109ada91d539235d5afbef98689e5f99ca4a5b9a99843c109d2665ea1c4843192d3cf9c25d4ccb3117c98c25fb455
data/README.md CHANGED
@@ -15,18 +15,57 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
15
15
 
16
16
  ## Basic usage
17
17
 
18
- Pass an IO object that responds to `read` and `seek` to `FormatParser`.
18
+ Pass an IO object that responds to `read` and `seek` to `FormatParser` and an array of matches will be returned.
19
19
 
20
20
  ```ruby
21
- file_info = FormatParser.parse(File.open("myimage.jpg", "rb"))
22
- file_info.file_nature #=> :image
23
- file_info.file_format #=> :JPG
24
- file_info.width_px #=> 320
25
- file_info.height_px #=> 240
26
- file_info.orientation #=> :top_left
21
+ matches = FormatParser.parse(File.open("myimage.jpg", "rb"))
22
+ matches.first.nature #=> :image
23
+ matches.first.format #=> :jpg
24
+ matches.first.width_px #=> 320
25
+ matches.first.height_px #=> 240
26
+ matches.first.orientation #=> :top_left
27
27
  ```
28
28
 
29
- If nothing is detected, the result will be `nil`.
29
+ If you would rather receive only one result, call the gem as follows:
30
+
31
+ ```ruby
32
+ FormatParser.parse(File.open("myimage.jpg", "rb"), returns: :one)
33
+ ```
34
+
35
+ You can also optimize the metadata extraction by providing hints to the gem:
36
+
37
+ ```ruby
38
+ FormatParser.parse(File.open("myimage", "rb"), natures: [:video, :image], formats: [:jpg, :png, :mp4])
39
+ ```
40
+
41
+ ## Creating your own parsers
42
+
43
+ In order to create new parsers, these have to meet three requirements:
44
+
45
+ 1) Instances of the new parser class need to respond to a `call` method which takes one IO object as an argument and returns some metadata information about its corresponding file, or nil otherwise.
46
+ 2) Instances of the new parser class need to respond to `natures` and `formats` accessor methods, both returning an array of symbols. A simple DSL is provided to avoid writing those accessors.
47
+ 3) The class needs to register itself as a parser.
48
+
49
+
50
+ Down below you can find a basic parser implementation:
51
+
52
+ ```ruby
53
+ class BasicParser
54
+ include FormatParser::DSL # Adds formats and natures methods to the class, which define
55
+ # accessors for all the instances.
56
+
57
+ formats :foo, :baz # Indicates which formats it can read.
58
+ natures :bar # Indicates which type of file from a human perspective it can read:
59
+ # - :audio
60
+ # - :document
61
+ # - :image
62
+ # - :video
63
+ def call(file)
64
+ # Returns a DTO object including some metadata.
65
+ end
66
+
67
+ FormatParser.register_parser_constructor self # Register this parser.
68
+ ```
30
69
 
31
70
  ## Design rationale
32
71
 
@@ -15,8 +15,14 @@ Gem::Specification.new do |spec|
15
15
  minimum amount of data possible."
16
16
  spec.homepage = "https://github.com/WeTransfer/format_parser"
17
17
  spec.license = "MIT"
18
-
19
- # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # Alert people to a change in the gem's interface, will remove in a subsequent version
19
+ spec.post_install_message = %q{
20
+ -----------------------------------------------------------------------------
21
+ | ATTENTION: format_parser v0.2.0 introduces changes to the gem's interface.|
22
+ | See https://github.com/WeTransfer/format_parser#basic-usage |
23
+ | for up-to-date usage instructions. Thank you for using format_parser! :) |
24
+ -----------------------------------------------------------------------------
25
+ }
20
26
  # to allow pushing to a single host or delete this section to allow pushing to any host.
21
27
  if spec.respond_to?(:metadata)
22
28
  spec.metadata['allowed_push_host'] = "https://rubygems.org"
@@ -35,7 +41,7 @@ Gem::Specification.new do |spec|
35
41
  spec.add_dependency 'ks', '~> 0.0.1'
36
42
  spec.add_dependency 'exifr', '~> 1.0'
37
43
  spec.add_dependency 'faraday', '~> 0.13'
38
-
44
+
39
45
  spec.add_development_dependency 'rspec', '~> 3.0'
40
46
  spec.add_development_dependency 'rake', '~> 12'
41
47
  spec.add_development_dependency 'simplecov', '~> 0.15'
data/lib/audio.rb ADDED
@@ -0,0 +1,37 @@
1
+ module FormatParser
2
+ class Audio
3
+ NATURE = :audio
4
+
5
+ # Type of the file (e.g :mp3)
6
+ attr_accessor :format
7
+
8
+ # The number of audio channels for sound files that are muxed
9
+ # and for video files with embedded sound
10
+ attr_accessor :num_audio_channels
11
+
12
+ # The audio sample rate in Hz for sound files that are muxed
13
+ # and for video files with embedded sound
14
+ attr_accessor :audio_sample_rate_hz
15
+
16
+ # Duration of the media object (be it audio or video) in seconds,
17
+ # as a Float
18
+ attr_accessor :media_duration_seconds
19
+
20
+ # Duration of the media object in addressable frames or samples,
21
+ # as an Integer
22
+ attr_accessor :media_duration_frames
23
+
24
+ # If a parser wants to provide any extra information to the caller
25
+ # it can be placed here
26
+ attr_accessor :intrinsics
27
+
28
+ # Only permits assignments via defined accessors
29
+ def initialize(**attributes)
30
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
31
+ end
32
+
33
+ def nature
34
+ NATURE
35
+ end
36
+ end
37
+ end
data/lib/document.rb ADDED
@@ -0,0 +1,17 @@
1
+ module FormatParser
2
+ class Document
3
+ NATURE = :document
4
+
5
+ attr_accessor :format
6
+ attr_accessor :document_type
7
+
8
+ # Only permits assignments via defined accessors
9
+ def initialize(**attributes)
10
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
11
+ end
12
+
13
+ def nature
14
+ NATURE
15
+ end
16
+ end
17
+ end
data/lib/format_parser.rb CHANGED
@@ -1,12 +1,16 @@
1
1
  require 'thread'
2
2
 
3
3
  module FormatParser
4
- require_relative 'file_information'
4
+ require_relative 'image'
5
+ require_relative 'audio'
6
+ require_relative 'document'
7
+ require_relative 'video'
5
8
  require_relative 'io_utils'
6
9
  require_relative 'read_limiter'
7
10
  require_relative 'remote_io'
8
11
  require_relative 'io_constraint'
9
12
  require_relative 'care'
13
+ require_relative 'parsers/dsl'
10
14
 
11
15
  PARSER_MUX = Mutex.new
12
16
 
@@ -14,6 +18,13 @@ module FormatParser
14
18
  PARSER_MUX.synchronize do
15
19
  @parsers ||= []
16
20
  @parsers << object_responding_to_new
21
+ # Gathering natures and formats from parsers. An instance has to be created.
22
+ parser = object_responding_to_new.new
23
+ @natures ||= Set.new
24
+ # NOTE: the merge method for sets modifies the instance in place.
25
+ @natures.merge(parser.natures)
26
+ @formats ||= Set.new
27
+ @formats.merge(parser.formats)
17
28
  end
18
29
  end
19
30
 
@@ -30,25 +41,32 @@ module FormatParser
30
41
  parse(cached_io)
31
42
  end
32
43
 
33
- def self.parse(io)
44
+ def self.parse(io, natures: @natures.to_a, formats: @formats.to_a, returns: :all)
34
45
  # If the cache is preconfigured do not apply an extra layer. It is going
35
46
  # to be preconfigured when using parse_http.
36
47
  io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
37
48
 
49
+ # How many results has the user asked for? Used to determine whether an array
50
+ # is returned or not.
51
+ amount = case returns
52
+ when :all
53
+ @parsers.count
54
+ when :one
55
+ 1
56
+ else
57
+ throw ArgumentError.new(":returns does not match any supported mode (:all, :one)")
58
+ end
59
+
38
60
  # Always instantiate parsers fresh for each input, since they might
39
61
  # contain instance variables which otherwise would have to be reset
40
62
  # between invocations, and would complicate threading situations
41
- parsers = @parsers.map(&:new)
42
-
43
- parsers.each do |parser|
63
+ results = parsers_for(natures, formats).map do |parser|
44
64
  # We need to rewind for each parser, anew
45
65
  io.seek(0)
46
66
  # Limit how many operations the parser can perform
47
67
  limited_io = ReadLimiter.new(io, max_bytes: 512*1024, max_reads: 64*1024, max_seeks: 64*1024)
48
68
  begin
49
- if info = parser.information_from_io(limited_io)
50
- return info
51
- end
69
+ parser.call(limited_io)
52
70
  rescue IOUtils::InvalidRead
53
71
  # There was not enough data for this parser to work on,
54
72
  # and it triggered an error
@@ -57,8 +75,21 @@ module FormatParser
57
75
  # caused the parser to go off-track. Strictly speaking we should log this
58
76
  # and examine the file more closely.
59
77
  end
60
- end
61
- nil # Nothing matched
78
+ end.reject(&:nil?).take(amount)
79
+
80
+ return results.first if amount == 1
81
+ # Convert the results from a lazy enumerator to an array.
82
+ results.to_a
83
+ end
84
+
85
+ private
86
+
87
+ def self.parsers_for(natures, formats)
88
+ # Returns a lazy enumerator so that only the minimum amount of work is computed (see :returns keyword argument)
89
+ @parsers.map(&:new).select do |parser|
90
+ # Does a given parser support at least one nature and at least one format asked for by the user?
91
+ (natures & parser.natures).size > 0 && (formats & parser.formats).size > 0
92
+ end.lazy
62
93
  end
63
94
 
64
95
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.1.7'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -1,13 +1,11 @@
1
1
  module FormatParser
2
- class FileInformation
3
-
4
- # What kind of file is it?
5
- attr_accessor :file_nature
2
+ class Image
3
+ NATURE = :image
6
4
 
7
5
  # What filetype was recognized? Will contain a non-ambiguous symbol
8
6
  # referring to the file format. The symbol can be used as a filename
9
7
  # extension safely
10
- attr_accessor :file_type
8
+ attr_accessor :format
11
9
 
12
10
  # Number of pixels horizontally in the pixel buffer
13
11
  attr_accessor :width_px
@@ -42,25 +40,6 @@ module FormatParser
42
40
  # http://magnushoff.com/jpeg-orientation.html
43
41
  attr_accessor :image_orientation
44
42
 
45
- # The number of audio channels for sound files that are muxed
46
- # and for video files with embedded sound
47
- attr_accessor :num_audio_channels
48
-
49
- # SampeThe number of audio channels for sound files that are muxed
50
- # and for video files with embedded sound
51
- attr_accessor :audio_sample_rate_hz
52
-
53
- # Duration of the media object (be it audio or video) in seconds,
54
- # as a Float
55
- attr_accessor :media_duration_seconds
56
-
57
- # Duration of the media object in addressable frames or samples,
58
- # as an Integer
59
- attr_accessor :media_duration_frames
60
-
61
- # XML Document Type
62
- attr_accessor :document_type
63
-
64
43
  # If a parser wants to provide any extra information to the caller
65
44
  # it can be placed here
66
45
  attr_accessor :intrinsics
@@ -70,8 +49,8 @@ module FormatParser
70
49
  attributes.map { |(k, v)| public_send("#{k}=", v) }
71
50
  end
72
51
 
73
- def self.image(**kwargs)
74
- new(file_nature: :image, **kwargs)
52
+ def nature
53
+ NATURE
75
54
  end
76
55
  end
77
56
  end
@@ -1,5 +1,6 @@
1
1
  class FormatParser::AIFFParser
2
2
  include FormatParser::IOUtils
3
+ include FormatParser::DSL
3
4
 
4
5
  # Known chunk types we can omit when parsing,
5
6
  # grossly lifted from http://www.muratnkonar.com/aiff/
@@ -18,7 +19,10 @@ class FormatParser::AIFFParser
18
19
  'ANNO',
19
20
  ]
20
21
 
21
- def information_from_io(io)
22
+ natures :audio
23
+ formats :aiff
24
+
25
+ def call(io)
22
26
  io = FormatParser::IOConstraint.new(io)
23
27
  form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
24
28
  return unless form_chunk_type == "FORM" && chunk_size > 4
@@ -62,9 +66,8 @@ class FormatParser::AIFFParser
62
66
  duration_in_seconds = sample_frames / sample_rate
63
67
  return unless duration_in_seconds > 0
64
68
 
65
- FormatParser::FileInformation.new(
66
- file_nature: :audio,
67
- file_type: :aiff,
69
+ FormatParser::Audio.new(
70
+ format: :aiff,
68
71
  num_audio_channels: channels,
69
72
  audio_sample_rate_hz: sample_rate.to_i,
70
73
  media_duration_frames: sample_frames,
@@ -1,5 +1,10 @@
1
1
  class FormatParser::DPXParser
2
2
  include FormatParser::IOUtils
3
+ include FormatParser::DSL
4
+
5
+ natures :image
6
+ formats :dpx
7
+
3
8
  FILE_INFO = [
4
9
  # :x4, # magic bytes SDPX, we read them anyway so not in the pattern
5
10
  :x4, # u32 :image_offset, :desc => 'Offset to image data in bytes', :req => true
@@ -124,7 +129,7 @@ class FormatParser::DPXParser
124
129
  LE_MAGIC = BE_MAGIC.reverse
125
130
  HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
126
131
 
127
- def information_from_io(io)
132
+ def call(io)
128
133
  io = FormatParser::IOConstraint.new(io)
129
134
  magic = io.read(4)
130
135
 
@@ -133,8 +138,8 @@ class FormatParser::DPXParser
133
138
  unpack_pattern = DPX_INFO
134
139
  unpack_pattern = DPX_INFO_LE if magic == LE_MAGIC
135
140
  num_elements, pixels_per_line, num_lines, *_ = safe_read(io, HEADER_SIZE).unpack(unpack_pattern)
136
- FormatParser::FileInformation.image(
137
- file_type: :dpx,
141
+ FormatParser::Image.new(
142
+ format: :dpx,
138
143
  width_px: pixels_per_line,
139
144
  height_px: num_lines,
140
145
  )
@@ -0,0 +1,29 @@
1
+ module FormatParser
2
+ # Small DSL to avoid repetitive code while defining new parsers. Also, it can be leveraged by
3
+ # third parties to define their own parsers.
4
+ module DSL
5
+ def self.included(base)
6
+ base.extend(ClassMethods)
7
+ end
8
+
9
+ module ClassMethods
10
+ def formats(*registred_formats)
11
+ __define(:formats, registred_formats)
12
+ end
13
+
14
+ def natures(*registred_natures)
15
+ __define(:natures, registred_natures)
16
+ end
17
+
18
+ private
19
+
20
+ def __define(name, value)
21
+ throw ArgumentError("empty array") if value.empty?
22
+ throw ArgumentError("requires array of symbols") if value.any? { |s| !s.is_a?(Symbol) }
23
+ define_method(name) do
24
+ value
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -1,26 +1,29 @@
1
1
  class FormatParser::FDXParser
2
2
  include FormatParser::IOUtils
3
+ include FormatParser::DSL
3
4
 
4
- def information_from_io(io)
5
+ formats :fdx
6
+ natures :document
7
+
8
+ def call(io)
5
9
  return if !xml_check(io)
6
10
  file_and_document_type = safe_read(io, 100)
7
11
  file_type, document_type = check_for_document_type(file_and_document_type)
8
12
  return if file_type != :fdx
9
- file_info = FormatParser::FileInformation.new(
10
- file_nature: :document,
11
- file_type: file_type,
13
+ FormatParser::Document.new(
14
+ format: file_type,
12
15
  document_type: document_type
13
16
  )
14
17
  end
15
18
 
16
19
  def xml_check(io)
17
20
  xml_check = safe_read(io, 5)
18
- xml_check == "<?xml" ? true : false
21
+ xml_check == "<?xml"
19
22
  end
20
-
23
+
21
24
  def check_for_document_type(file_and_document_type)
22
25
  sanitized_data = file_and_document_type.downcase
23
- if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
26
+ if sanitized_data.include?("finaldraft") && sanitized_data.include?("script")
24
27
  return :fdx, :script
25
28
  else
26
29
  return