checking-you-out 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +661 -0
- data/README.md +14 -0
- data/bin/are-we-unallocated-yet +9 -0
- data/bin/benchmark +66 -0
- data/bin/checking-you-out +8 -0
- data/bin/repl +7 -0
- data/bin/test-my-best +4 -0
- data/lib/checking-you-out.rb +40 -0
- data/lib/checking-you-out/auslandsgesprach.rb +253 -0
- data/lib/checking-you-out/ghost_revival.rb +71 -0
- data/lib/checking-you-out/ghost_revival/mr_mime.rb +390 -0
- data/lib/checking-you-out/ghost_revival/xross_infection.rb +146 -0
- data/lib/checking-you-out/inner_spirit.rb +215 -0
- data/lib/checking-you-out/party_starter.rb +202 -0
- data/lib/checking-you-out/party_starter/stick_around.rb +260 -0
- data/lib/checking-you-out/party_starter/weighted_action.rb +41 -0
- data/lib/checking-you-out/sweet_sweet_love_magic.rb +226 -0
- data/mime/packages/distorted-types.xml +68 -0
- data/mime/packages/third-party/shared-mime-info/freedesktop.org.xml.in +7672 -0
- data/mime/packages/third-party/tika-mimetypes/tika-mimetypes.xml +2762 -0
- metadata +232 -0
@@ -0,0 +1,215 @@
|
|
1
|
+
require 'set' unless defined? ::Set
|
2
|
+
require 'pathname' unless defined? ::Pathname
|
3
|
+
|
4
|
+
|
5
|
+
# Utility Modules/procs/lambdas/etc for generic operations like checking WeightedActions.
|
6
|
+
require_relative 'party_starter' unless defined? ::CHECKING::YOU::WeightedAction
|
7
|
+
|
8
|
+
|
9
|
+
# This base Struct will be used as the Hash key for its matching `OUT` subclass object,
|
10
|
+
# and its members correspond to the three major parts of an IETF "Content-Type" String,
|
11
|
+
# e.g. "application/x-saturn-rom" → :x, :application, :"saturn-rom".
|
12
|
+
#
|
13
|
+
# This is kind of a leaky abstraction since I want to support non-IETF type systems too,
|
14
|
+
# but the IETF system is by far the most relevant one to us because the most exhaustive
|
15
|
+
# source data (`shared-mime-info`) is based on that format and because, you know, Internet.
|
16
|
+
# See the adjacent `auslandsgesprach.rb` for the parser and more info.
|
17
|
+
#
|
18
|
+
#
|
19
|
+
# The instances of a `Struct` subclass with at most `RSTRUCT_EMBED_LEN_MAX` members
|
20
|
+
# can fit entirely within an `RStruct` without additional heap allocation.
|
21
|
+
# In MRI (at least as of 3.0) the `RSTRUCT_EMBED_LEN_MAX` is 3, so CYI uses three members.
|
22
|
+
#
|
23
|
+
# For more info see:
|
24
|
+
# - https://github.com/ruby/ruby/blob/master/gc.c
|
25
|
+
# - http://patshaughnessy.net/2013/2/8/ruby-mri-source-code-idioms-3-embedded-objects
|
26
|
+
CHECKING::YOU::IN ||= Struct.new(
|
27
|
+
# Intentionally avoiding naming taxonomic ranks like "domain", "class", or "order"
|
28
|
+
# whose names are already common in computing.
|
29
|
+
:kingdom,
|
30
|
+
:phylum,
|
31
|
+
:genus,
|
32
|
+
) do
|
33
|
+
# Promote any CYI to its CYO singleton. CYO has the opposites of these methods.
|
34
|
+
def out; ::CHECKING::YOU::OUT::new(self); end
|
35
|
+
def in; self; end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Main Struct subclass for in-memory type representation.
|
39
|
+
# Instances of the base `CHECKING::YOU::IN` Struct will refer to only one of these,
|
40
|
+
# and this matching object will contain all relevant data about the type,
|
41
|
+
# such as file extension(s), `magic` bytes, and variations of a base type like all of:
|
42
|
+
# - "application/vnd.wordperfect;"
|
43
|
+
# - "application/vnd.wordperfect;version=4.2"
|
44
|
+
# - "application/vnd.wordperfect;version=5.0"
|
45
|
+
# - "application/vnd.wordperfect;version=5.1"
|
46
|
+
# - "application/vnd.wordperfect;version=6.x"
|
47
|
+
# …will be represented in a single `CHECKING::YOU::OUT` object.
|
48
|
+
class ::CHECKING::YOU::OUT < ::CHECKING::YOU::IN
|
49
|
+
|
50
|
+
# Absolute path to the root of the Gem — the directory containing `bin`,`docs`,`lib`, etc.
|
51
|
+
GEM_ROOT = proc { ::Pathname.new(__dir__).join(*Array.new(2, -'..')).expand_path.realpath }
|
52
|
+
|
53
|
+
# Time object representing the day this running CYO Gem was packaged.
|
54
|
+
#
|
55
|
+
# `Gem::Specification#date` can be slightly misleading when developing locally with Bundler using `bundle exec`.
|
56
|
+
# One might expect the result of `#date` to be "now" (including hours/minutes/seconds) in UTC for such a runtime-packaged Gem,
|
57
|
+
# but it will always be midnight UTC of the current day (also in UTC), i.e. a date that is always[0] in the past.
|
58
|
+
#
|
59
|
+
# After ${your-UTC-offset} hours before midnight localtime, this will give you a *day* that seems to be in the future
|
60
|
+
# compared to a system clock displaying localtime despite that *date* UTC still being in the past,
|
61
|
+
# e.g. as I write this comment at 2021-05-25 22:22 PST, `GEM_PACKAGE_TIME.call` returns `2021-05-26 00:00:00 UTC`.
|
62
|
+
#
|
63
|
+
# Rescue from `Gem::MissingSpecError`'s parent to support developing locally with just `require_relative` and no Bundler.
|
64
|
+
#
|
65
|
+
# [0]: unless you manage to `bundle exec` at exactly 00:00:00 UTC :)
|
66
|
+
GEM_PACKAGE_TIME = proc { begin; Gem::Specification::find_by_name(-'checking-you-out').date; rescue Gem::LoadError; Time.now; end }
|
67
|
+
|
68
|
+
# Simple name/value pair Struct.
# NOTE(review): presumably models one Media-Type parameter such as `version=4.2` — confirm against the parser.
Species = Struct.new(:name, :value) do
  # Build a Species from a `name=value` String.
  #
  # Only the first `=` separates the name from the value, so a value which itself
  # contains `=` stays intact instead of producing a third Array element
  # (which would make `Struct::new` raise "struct size differs").
  # A String with no `=` yields a Species whose `value` is `nil`.
  def self.from_string(param_string)
    return self.new(*param_string.split(-'=', 2))
  end
end
|
73
|
+
|
74
|
+
# Main memoization Hash for our loaded Type data.
|
75
|
+
# { CHECKING::YOU::IN => CHECKING::YOU::OUT }
|
76
|
+
def self.all_night; @all_night ||= Hash.new(nil); end
|
77
|
+
|
78
|
+
# Return a singleton instance for any CYO.
|
79
|
+
def self.new(taxa)
|
80
|
+
# Support IETF String argument to this method, e.g. ::CHECKING::YOU::OUT::new('application/octet-stream')
|
81
|
+
return self.from_ietf_media_type(taxa) if taxa.is_a?(String)
|
82
|
+
# Otherwise return the memoized CYO singleton of this type.
|
83
|
+
self.all_night[
|
84
|
+
taxa.is_a?(::CHECKING::YOU::IN) ? taxa : super(*taxa)
|
85
|
+
] ||= self.allocate.tap { |cyo| cyo.send(:initialize, *taxa) }
|
86
|
+
end
|
87
|
+
|
88
|
+
# Demote any CYO to a CYI that can be passed around in just 40 bytes.
|
89
|
+
# CYI has the opposites of these methods.
|
90
|
+
def out; self; end
|
91
|
+
def in; self.class.all_night.key(self); end
|
92
|
+
|
93
|
+
|
94
|
+
# Get a CYO, Set[CYO], or nil by file-extension, e.g. `doc` => { CYO msword, CYO rtf }.
|
95
|
+
POSTFIX_KEY = proc {
|
96
|
+
# Re-use a single search structure to avoid allocating an Object per search.
|
97
|
+
scratch = ::CHECKING::YOU::StickAround.new(-'')
|
98
|
+
# Additionally accelerate multiple searches for the same thing by avoiding `StickAround#replace`
|
99
|
+
# if the new search key already matches the previous search key.
|
100
|
+
# Mark `case_sensitive: false` here for testing arbitrarily-named input.
|
101
|
+
-> { scratch.eql?(_1) ? scratch : scratch.replace(_1, case_sensitive: false) }
|
102
|
+
}.call
|
103
|
+
def self.from_postfix(stick_around)
  # `@after_forever` only exists once the first Postfix has been registered
  # (see `add_pathname_fragment`); an unset haystack simply means "no match".
  haystack = self.instance_variable_get(:@after_forever)
  return nil if haystack.nil?
  haystack[POSTFIX_KEY.call(stick_around)]
end
|
106
|
+
|
107
|
+
# Get a Hash[CYO] or nil for arbitrary non-file-extension glob match of a File basename.
|
108
|
+
def self.from_glob(stick_around)
  # `@stick_around` only exists once the first non-Postfix glob has been registered
  # (see `add_pathname_fragment`); an unset haystack simply means "no match".
  matched = self.instance_variable_get(:@stick_around)&.select { |glob, _cyo|
    # Ask the stored key, not the needle, so the key's own `eql?` decides the match.
    glob.eql?(stick_around)
  }
  # Never hand an empty Hash back to callers.
  return nil if matched.nil? || matched.empty?
  matched
end
|
115
|
+
|
116
|
+
def self.from_pathname(pathname)
|
117
|
+
return self.from_glob(pathname) || self.from_postfix(pathname)
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
# Add a new Postfix or Glob for a specific type.
|
122
|
+
def add_pathname_fragment(fragment)
|
123
|
+
if fragment.start_with?(-'*.') and fragment.count(-?.) == 1 and fragment.count(-?*) == 1 then
|
124
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@postfixes, fragment, self)
|
125
|
+
::CHECKING::YOU::CLASS_NEEDLEMAKER.call(:@after_forever, fragment, self)
|
126
|
+
else
|
127
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@globs, fragment, self)
|
128
|
+
::CHECKING::YOU::CLASS_NEEDLEMAKER.call(:@stick_around, fragment, self)
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
|
133
|
+
# Get a `Set` of this CYO's own CYI and all of its alias CYIs, at minimum just `Set[self.in]`.
|
134
|
+
def aka
|
135
|
+
return case @aka
|
136
|
+
when nil then Set[self.in]
|
137
|
+
when self.class, self.class.superclass then Set[self.in, @aka]
|
138
|
+
when ::Set then Set[self.in, *@aka]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Take an additional CYI, store it locally, and memoize it as an alias for this CYO.
|
143
|
+
def add_aka(taxa)
|
144
|
+
taxa = taxa.is_a?(::CHECKING::YOU::IN) ? taxa : self.class.superclass.new(*taxa)
|
145
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@aka, taxa, self)
|
146
|
+
self.class.all_night[taxa] = self
|
147
|
+
end
|
148
|
+
|
149
|
+
# Forget a CYI alias of this Type. Capable of unsetting the "real" CYI as well if desired.
|
150
|
+
def remove_aka(taxa)
|
151
|
+
taxa = taxa.is_a?(::CHECKING::YOU::IN) ? taxa : self.class.superclass.new(*taxa)
|
152
|
+
self.class.all_night.delete(taxa) if self.class.all_night[taxa] === self
|
153
|
+
end
|
154
|
+
|
155
|
+
attr_reader :parents, :children
|
156
|
+
|
157
|
+
# Take an additional CYO, store it locally as our parent, and ask it to add ourselves as its child.
|
158
|
+
def add_parent(parent_cyo)
|
159
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@parents, parent_cyo, self)
|
160
|
+
parent_cyo.add_child(self) unless parent_cyo.children&.include?(self)
|
161
|
+
end
|
162
|
+
|
163
|
+
# Take an additional CYO, store it locally as our child, and ask it to add ourselves as its parent.
|
164
|
+
def add_child(child_cyo)
|
165
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@children, child_cyo, self)
|
166
|
+
child_cyo.add_parent(self) unless child_cyo.parents&.include?(self)
|
167
|
+
end
|
168
|
+
|
169
|
+
# Get a `Set` of this CYO and all of its parent CYOs, at minimum just `Set[self]`.
|
170
|
+
def adults_table
|
171
|
+
return case @parents
|
172
|
+
when nil then Set[self]
|
173
|
+
when self.class, self.class.superclass then Set[self, @parents]
|
174
|
+
when ::Set then Set[self, *@parents]
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
# Get a `Set` of this CYO and all of its child CYOs, at minimum just `Set[self]`.
|
179
|
+
def kids_table
|
180
|
+
return case @children
|
181
|
+
when nil then Set[self]
|
182
|
+
when self.class, self.class.superclass then Set[self, @children]
|
183
|
+
when ::Set then Set[self, *@children]
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
# Get a `Set` of this CYO and all parents and children, at minimum just `Set[self]`.
|
188
|
+
def family_tree
  # Union of the child and parent tables; each of them already contains `self`.
  kids_table | adults_table
end
|
189
|
+
|
190
|
+
# Storage for descriptions (`<comment>`), acronyms, suitable iconography, and other boring metadata, e.g.:
|
191
|
+
# <mime-type type="application/vnd.oasis.opendocument.text">
|
192
|
+
# <comment>ODT document</comment>
|
193
|
+
# <acronym>ODT</acronym>
|
194
|
+
# <expanded-acronym>OpenDocument Text</expanded-acronym>
|
195
|
+
# <generic-icon name="x-office-document"/>
|
196
|
+
# […]
|
197
|
+
# </mime-type>
|
198
|
+
attr_accessor :description
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
# IETF Media-Type parser and methods that use that parser.
|
203
|
+
require_relative 'auslandsgesprach' unless defined? ::CHECKING::YOU::IN::AUSLANDSGESPRÄCH
|
204
|
+
::CHECKING::YOU::IN.extend(::CHECKING::YOU::IN::AUSLANDSGESPRÄCH)
|
205
|
+
::CHECKING::YOU::IN.include(::CHECKING::YOU::IN::INLANDGESPRÄCH)
|
206
|
+
::CHECKING::YOU::OUT.extend(::CHECKING::YOU::OUT::AUSLANDSGESPRÄCH)
|
207
|
+
|
208
|
+
# Content matching à la `libmagic`/`file`.
|
209
|
+
require_relative 'sweet_sweet_love_magic' unless defined? ::CHECKING::YOU::SweetSweet♥Magic
|
210
|
+
::CHECKING::YOU::OUT.extend(::CHECKING::YOU::SweetSweet♡Magic)
|
211
|
+
::CHECKING::YOU::OUT.prepend(::CHECKING::YOU::SweetSweet♥Magic)
|
212
|
+
|
213
|
+
# Methods for loading type data from `shared-mime-info` package XML files.
|
214
|
+
require_relative 'ghost_revival' unless defined? ::CHECKING::YOU::GHOST_REVIVAL
|
215
|
+
::CHECKING::YOU::OUT.extend(::CHECKING::YOU::OUT::GHOST_REVIVAL)
|
@@ -0,0 +1,202 @@
|
|
1
|
+
require 'set' unless defined? ::Set
|
2
|
+
require 'pathname' unless defined? ::Pathname
|
3
|
+
|
4
|
+
|
5
|
+
# This file defines/imports various utility Modules/procs/etc that should be available
|
6
|
+
# to all other CYO components without `including`/`extending`.
|
7
|
+
require_relative 'party_starter/weighted_action' unless defined? ::CHECKING::YOU::WeightedAction
|
8
|
+
require_relative 'party_starter/stick_around' unless defined? ::CHECKING::YOU::StickAround
|
9
|
+
|
10
|
+
class CHECKING::YOU
|
11
|
+
|
12
|
+
# The following two `proc`s handle classwide-memoization and instance-level assignment
|
13
|
+
# for values that may be Enumerable but often refer to only a single Object.
|
14
|
+
#
|
15
|
+
# For example, most `Postfix`es (file extensions) will only ever belong to a single CYO Object,
|
16
|
+
# but a handful represent possibly-multiple types, like how `.doc` can be an MSWord file or WordPad RTF.
|
17
|
+
#
|
18
|
+
# These assignment procs take a storage haystack, a needle to store, and the CYO receiver to which the needle refers.
|
19
|
+
# They will set `haystack[needle] => CYO` if that needle is unique and unset, or they will convert
|
20
|
+
# an existing single `haystack[needle] => CYO` assignment to `haystack[needle] => Set[existingCYO, newCYO]`.
|
21
|
+
#
|
22
|
+
# This is an admittedly-annoying complexity-for-performance tradeoff with the goal of allocating
|
23
|
+
# as few spurious containers as possible instead of explicitly initializing a Set for every needle
|
24
|
+
# when most of them would wastefully be a Set of just a single thing.
|
25
|
+
CLASS_NEEDLEMAKER = proc { |haystack, needle, receiver|
|
26
|
+
# Create the container if this is the very first invocation.
|
27
|
+
receiver.class.instance_variable_set(haystack, Hash.new(nil)) unless receiver.class.instance_variable_defined?(haystack)
|
28
|
+
|
29
|
+
# Set the `haystack` Hash's `needle` key to the `receiver` if the `key` is unset, otherwise
|
30
|
+
# to a `Set` of the existing value plus `receiver` if that value is not `receiver` already.
|
31
|
+
receiver.class.instance_variable_get(haystack).tap { |awen|
|
32
|
+
case awen[needle]
|
33
|
+
when nil then awen[needle] = receiver
|
34
|
+
when ::Set then awen[needle].add(receiver)
|
35
|
+
when receiver.class then awen[needle] = Set[awen[needle], receiver] unless awen[needle] == receiver
|
36
|
+
end
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
# This is the instance-level version of the above, e.g. a CYO with only one Postfix
|
41
|
+
# will assign `cyo.:@postfixes = Set[Postfix]`, and a CYO with many Postfixes will assign
|
42
|
+
# e.g. `cyo.:@postfixes = Set[post, fix, es, …]`.
|
43
|
+
INSTANCE_NEEDLEMAKER = proc { |haystack, needle, receiver|
|
44
|
+
if receiver.instance_variable_defined?(haystack) then
|
45
|
+
receiver.instance_variable_get(haystack).add(needle)
|
46
|
+
else
|
47
|
+
receiver.instance_variable_set(haystack, Set[needle])
|
48
|
+
end
|
49
|
+
}
|
50
|
+
|
51
|
+
|
52
|
+
# Test a Pathname representing an extant file whose contents and metadata we can use.
|
53
|
+
# This is separated into a lambda due to the complexity, since the entry-point might
|
54
|
+
# be given a String that could represent a Media Type, a hypothetical path,
|
55
|
+
# an extant path, or even raw stream contents. It could be given a Pathname representing
|
56
|
+
# either a hypothetical or extant file. It could be given an IO/Stream object.
|
57
|
+
# Several input possibilities will end up calling this lambda.
|
58
|
+
#
|
59
|
+
# Some of this complexity is my fault, since I'm doing a lot of variable juggling
|
60
|
+
# to avoid as many new-Object-allocations as possible in the name of performance
|
61
|
+
# since this library is the very core-est core of DistorteD;
|
62
|
+
# things like assigning Hash values to single CYO objects the first time that key is stored
|
63
|
+
# then replacing that value with a Set iff that key needs to reference any additional CYO.
|
64
|
+
#
|
65
|
+
# - `::from_xattr` can return `nil` or a single `CYO` depending on filesystem extended attributes.
|
66
|
+
# It is very very unlikely that most people will ever use this, but I think it's cool 8)
|
67
|
+
#
|
68
|
+
# - `::from_postfix` can return `nil`, `CYO`, or `Set` since I decided to store Postfixes
|
69
|
+
# separately from freeform globs since file-extension matches are the vast majority of globs.
|
70
|
+
# Postfixes avoid needing to be weighted since they all represent the same final pathname component
|
71
|
+
# and should never result in multiple conflicting Postfix key matches.
|
72
|
+
# A single Postfix key can represent multiple CYOs, though; hence the possible `Set`.
|
73
|
+
#
|
74
|
+
# - `::from_glob` can return `nil` or `Hash` since even a single match will include the weighted key.
|
75
|
+
#
|
76
|
+
# - `::from_content` can return `nil` or `Hash` based on a `libmagic`-style match of file/stream contents.
|
77
|
+
# Many common types can be determined from the first four bytes alone, but we support matching
|
78
|
+
# arbitrarily-long sequences against arbitrarily-big byte range boundaries.
|
79
|
+
# These keys will also be weighted, even for a single match.
|
80
|
+
TEST_EXTANT_PATHNAME = -> (pathname, so_deep: true, only_one_match: true) {
|
81
|
+
|
82
|
+
# Never return empty Enumerables.
|
83
|
+
# Yielding-self to this proc will `nil`-ify anything that's `:empty?`
|
84
|
+
# and will pass any non-Enumerable Objects through.
|
85
|
+
# The predicate is `empty?` (with the question mark); probing for `:empty` never matches
# a `Set`/`Hash`/`Array`, which would let empty containers through un-`nil`-ified.
point_zero = proc { _1.respond_to?(:empty?) ? (_1.empty? ? nil : _1) : _1 }
|
86
|
+
|
87
|
+
# Our matching block will return a single CYO when possible, and can optionally
|
88
|
+
# return multiple CYO matches for ambiguous files/streams.
|
89
|
+
# Multiple matching must be opted into with `only_one_match: false` so it doesn't need to be
|
90
|
+
# checked by every caller that is fine with best-effort and wants to minimize allocations.
|
91
|
+
one_or_eight = proc { |huh|
  if huh.nil? then nil
  # Only collapse containers that can both report emptiness and yield a first element.
  # Anything else is passed through untouched. That includes a single CYO, which as a
  # `Struct` has `first` (via `Enumerable`) but no `empty?`.
  elsif huh.respond_to?(:empty?) && huh.respond_to?(:first)
    if huh.empty? then nil
    # A lone match is always unwrapped; multiple matches are reduced to the first one
    # unless the caller opted into ambiguity with `only_one_match: false`.
    elsif huh.size == 1 || only_one_match then huh.is_a?(::Hash) ? huh.values.first : huh.first
    else huh
    end
  else huh
  end
}
|
103
|
+
|
104
|
+
# Test all "glob" matches against all child Types of all "magic" matches to allow for
|
105
|
+
# nuanced detection of ambiguous streams where a `magic` match returns multiple possibilities,
|
106
|
+
# e.g. using a `.doc` Postfix-match to choose a `text-plain` glob-match for non-Word `.doc` files
|
107
|
+
# or to choose a `application/msword` glob-match over a more generic `application/x-ole-storage`
|
108
|
+
# magic-match when the magic weights alone are not enough information to make the correct choice.
|
109
|
+
# irb> ::CHECKING::YOU::OUT::from_postfix('doc')
|
110
|
+
# => #<Set: {#<CHECKING::YOU::OUT application/msword>, #<CHECKING::YOU::OUT text/plain>}>
|
111
|
+
#
|
112
|
+
# Again, a lot of the complexity here is "my fault" in that I could avoid it by explicitly using
|
113
|
+
# the same data structures for all the different inputs, but I need this to be as fast
|
114
|
+
# and as low-overhead as possible which means avoiding allocations of things like
|
115
|
+
# Enumerables that end up holding only a single other object.
|
116
|
+
# Obviously that leads to a lot of variation in result values from helper methods,
|
117
|
+
# so I'll own that here instead of ever making callsites deal with it.
|
118
|
+
#
|
119
|
+
# This `proc`'s output will introduce a little more of that same complexity since it will be `nil`
|
120
|
+
# if either input is `nil`, will be a single CYO if there is only one union match,
|
121
|
+
# or a `Set` if there are still multiple possibilities.
|
122
|
+
magic_children = proc { |glob, magic|
|
123
|
+
# NOTE: CYO deviates from `shared-mime-info`'s behavior very slightly here!
|
124
|
+
#
|
125
|
+
# `shared-mime-info`'s "Recommended checking order" documentation sez:
|
126
|
+
# "If any of the mimetypes resulting from a glob match is equal to or a subclass of the result
|
127
|
+
# from the magic sniffing, use this as the result. This allows us for example to distinguish text files
|
128
|
+
# called 'foo.doc' from MS-Word files with the same name, as the magic match for the MS-Word file would be
|
129
|
+
# `application/x-ole-storage` which the MS-Word type inherits."
|
130
|
+
#
|
131
|
+
# Our behavior is identical except it allows glob matches which are a *superclass* of a
|
132
|
+
# magic-match in addition to subclass or equal-to, i.e. using `:family_tree` for comparison here
|
133
|
+
# instead of using `:kids_table`. There might be a downside to this that I haven't found yet
|
134
|
+
# but it allows CYO to better match some things, e.g. matching a `'.flv'` video file as
|
135
|
+
# `'video/x-flv'` instead of as `'video/x-javafx'`, since fd.o has the latter as a subclass of the former.
|
136
|
+
case [glob, magic]
|
137
|
+
in ::NilClass, * then nil
|
138
|
+
in *, ::NilClass then nil
|
139
|
+
in ::Set, ::Hash then glob & magic.values.to_set.map(&:family_tree).reduce(&:&)
|
140
|
+
in ::Set, ::CHECKING::YOU::OUT then glob & magic.kids_table
|
141
|
+
in ::Hash, ::Hash then glob.values.to_set & magic.values.to_set.map(&:family_tree).reduce(&:&)
|
142
|
+
in ::CHECKING::YOU::OUT, ::Hash then magic.values.to_set.map(&:family_tree).reduce(&:&)&.include?(glob) ? glob : nil
|
143
|
+
in ::Hash, ::CHECKING::YOU::OUT then glob.values.to_set & magic.kids_table
|
144
|
+
in ::CHECKING::YOU::OUT, ::CHECKING::YOU::OUT then glob == magic ? glob : nil
|
145
|
+
else nil
|
146
|
+
end.yield_self(&point_zero)
|
147
|
+
}
|
148
|
+
|
149
|
+
# "If a MIME type is provided explicitly (eg, by a ContentType HTTP header, a MIME email attachment,
|
150
|
+
# an extended attribute or some other means) then that should be used instead of guessing."
|
151
|
+
# This will probably always be `nil` since this is a niche feature, but we have to test it first.
|
152
|
+
::CHECKING::YOU::OUT::from_xattr(pathname) || begin
|
153
|
+
|
154
|
+
# "Start by doing a glob match of the filename. Keep only globs with the biggest weight."
|
155
|
+
# "If the patterns are different, keep only matched with the longest pattern."
|
156
|
+
# "If after this, there is one or more matching glob, and all the matching globs result in
|
157
|
+
# the same mimetype, use that mimetype as the result."
|
158
|
+
# This can be `nil`, `CYO`, a `Set` of Postfix matches, or a `Hash` of weighted Glob matches.
|
159
|
+
glob_matched = ::CHECKING::YOU::OUT::from_pathname(pathname)
|
160
|
+
|
161
|
+
# "If the glob matching fails or results in multiple conflicting mimetypes,
|
162
|
+
# read the contents of the file and do magic sniffing on it."
|
163
|
+
# This can be `nil` or a `Hash` of weighted magic matches.
|
164
|
+
# A single CYO is a `Struct`, and `Struct` mixes in `Enumerable`, so an `is_a?(Enumerable)` test
# is true for every non-`nil` glob result and `so_deep: false` could never skip content sniffing.
# Test for the actual multi-match containers (`Set` of Postfix matches / `Hash` of weighted globs).
magic_matched = (glob_matched.nil? || glob_matched.is_a?(::Set) || glob_matched.is_a?(::Hash) || so_deep) ? ::CHECKING::YOU::OUT::from_content(pathname) : nil
|
165
|
+
|
166
|
+
# Make a decision based on the two possible matches above plus a third match category
|
167
|
+
# based on a union between the glob match and all children of all magic matches.
|
168
|
+
# See the relevant proc above. Its result will always be `nil` if either input is `nil`.
|
169
|
+
#
|
170
|
+
# "If there was no glob match, use the magic match as the result."
|
171
|
+
# "Otherwise use the result of the glob match that has the highest weight."
|
172
|
+
return case [glob_matched, magic_matched, magic_children.call(glob_matched, magic_matched)]
|
173
|
+
in ::NilClass, ::Hash, ::NilClass then LEGENDARY_HEAVY_GLOW.call(magic_matched, :weight)
|
174
|
+
in ::CHECKING::YOU::OUT, ::NilClass, ::NilClass then glob_matched
|
175
|
+
in ::Set, ::NilClass, ::NilClass then glob_matched
|
176
|
+
in ::Hash, ::NilClass, ::NilClass then LEGENDARY_HEAVY_GLOW.call(glob_matched, [:weight, :length])
|
177
|
+
in *, ::CHECKING::YOU::OUT => only_one_type then only_one_type
|
178
|
+
in ::Set, ::Hash, ::Set => magic_children then
|
179
|
+
# Choose the union-matched type having the heaviest magic-matched weight.
|
180
|
+
LEGENDARY_HEAVY_GLOW.call(magic_matched.keep_if { |_magic, cyo| magic_children.include?(cyo) }, :weight)
|
181
|
+
in ::Hash, ::Hash, ::Set => magic_children then
|
182
|
+
# Choose the union-matched type having the heaviest glob-matched weight,
|
183
|
+
# and then additionally the longest glob string if there are still multiple matches.
|
184
|
+
LEGENDARY_HEAVY_GLOW.call(glob_matched.keep_if { |_glob, cyo| magic_children.include?(cyo) }, [:weight, :length])
|
185
|
+
in ::CHECKING::YOU::OUT, ::Hash, ::NilClass then glob_matched
|
186
|
+
in ::CHECKING::YOU::OUT, ::Hash, ::Set => magic_children then
|
187
|
+
# Choose the single glob-matched type iff it was also magic-matched,
|
188
|
+
# otherwise choose the heaviest magic-matched type.
|
189
|
+
magic_matched.values.include?(glob_matched) ? glob_matched : LEGENDARY_HEAVY_GLOW.call(magic_matched, :weight)
|
190
|
+
in ::NilClass, ::NilClass, ::NilClass then
|
191
|
+
# "If no magic rule matches the data (or if the content is not available),
|
192
|
+
# use the default type of application/octet-stream for binary data, or text/plain for textual data."
|
193
|
+
# "Note: Checking the first 128 bytes of the file for ASCII control characters is a good way to guess
|
194
|
+
# whether a file is binary or text, but note that files with high-bit-set characters should still be
|
195
|
+
# treated as text since these can appear in UTF-8 text, unlike control characters."
|
196
|
+
::CHECKING::YOU::OUT::from_ietf_media_type('application/octet-stream')
|
197
|
+
else nil
|
198
|
+
end.yield_self(&one_or_eight)
|
199
|
+
end # ::CHECKING::YOU::OUT::from_xattr(pathname) || begin
|
200
|
+
} # TEST_EXTANT_PATHNAME
|
201
|
+
|
202
|
+
end # class CHECKING::YOU
|
@@ -0,0 +1,260 @@
|
|
1
|
+
require 'pathname' unless defined?(::Pathname)
|
2
|
+
|
3
|
+
|
4
|
+
class CHECKING::YOU
|
5
|
+
# Provide case-optional String-like keys for Postfixes, Globs, etc.
|
6
|
+
#
|
7
|
+
# From Ruby's `Hash` docs: "Two objects refer to the same hash key when their hash value is identical
|
8
|
+
# and the two objects are eql? to each other"
|
9
|
+
# I tried to subclass String and just override `:eql?` and `:hash` for case-insensitive lookups,
|
10
|
+
# but it turns out not to be that easy due to MRI's C comparison functions for String, Symbol, etc.
|
11
|
+
#
|
12
|
+
# It was super-confusing because I could call e.g. `'DOC'.eql? 'doc'` manually and get `true`,
|
13
|
+
# but it would always fail to work when used as a `Hash` key, when calling `uniq`, or in a `Set`:
|
14
|
+
#
|
15
|
+
# irb(main):049:1* Lol = Class.new(String).tap {
|
16
|
+
# irb(main):050:1* _1.define_method(:hash) do; self[0..5].downcase!.hash; end;
|
17
|
+
# irb(main):051:1* _1.define_method(:eql?) do |lol|; self[0..5].casecmp?(lol[0..5]); end;
|
18
|
+
# irb(main):052:1* _1.alias_method(:==, :eql?)
|
19
|
+
# irb(main):053:0> }
|
20
|
+
# irb(main):054:0> fart = Lol.new("abcdefg")
|
21
|
+
# irb(main):055:0> butt = Lol.new("abcdefgh")
|
22
|
+
# irb(main):056:0> fart == butt
|
23
|
+
# => true
|
24
|
+
# irb(main):057:0> fart.eql? butt
|
25
|
+
# => true
|
26
|
+
# irb(main):058:0> fart.hash
|
27
|
+
# => 1243221847611081438
|
28
|
+
# irb(main):059:0> butt.hash
|
29
|
+
# => 1243221847611081438
|
30
|
+
# irb(main):060:0> {fart => "smella"}[butt]
|
31
|
+
# => nil
|
32
|
+
# irb(main):061:0> {fart => "smella"}[fart]
|
33
|
+
# => "smella"
|
34
|
+
#
|
35
|
+
# I'm not the first to run into this, as I found when searching for `"rb_str_hash_cmp"`:
|
36
|
+
# https://kate.io/blog/strange-hash-instances-in-ruby/
|
37
|
+
#
|
38
|
+
# To work around this I will explicitly `downcase` the actual String subclass' value
|
39
|
+
# and just let the hashes collide for differently-cased values, then `eql?` will decide.
|
40
|
+
# This is still slower than the all-C String code but is the fastest method I've found
|
41
|
+
# to achieve this without doubling my Object allocations by wrapping each String in a Struct.
|
42
|
+
StickAround = Class.new(::String) do
|
43
|
+
|
44
|
+
# Be case-insensitive by default so we can match any filename.
|
45
|
+
DEFAULT_SENSITIVITY = false
|
46
|
+
|
47
|
+
# These may be weighted just like byte sequences.
|
48
|
+
include WeightedAction
|
49
|
+
|
50
|
+
# This class needs to support being instantiated without a value due to the way our XML data gets loaded,
|
51
|
+
# but the superclass `String` has a default `str=""` argument here that works perfectly for that need.
|
52
|
+
def initialize(str=-'', *args, case_sensitive: DEFAULT_SENSITIVITY, **kwargs)
|
53
|
+
# Prime `#replace` to treat its next `String` as case-sensitive iff we were told.
|
54
|
+
instance_variable_set(:@case_sensitive, case_sensitive) if case_sensitive == true
|
55
|
+
|
56
|
+
# Don't pass an initial `str` value to `super` if we were given one,
|
57
|
+
# because `#replace` has case-sensitivity-handling functionality that must be called.
|
58
|
+
super(str, *args, **kwargs)
|
59
|
+
self.replace(str) unless str.empty?
|
60
|
+
end
|
61
|
+
|
62
|
+
# Mark intent to be case-sensitive. Our source data's `<glob>` Attributes are parsed one at a time,
|
63
|
+
# so we won't know at the time of instantiation if we want to be case sensitive.
|
64
|
+
def case_sensitive=(sensitivity)
|
65
|
+
# Don't bother allocating an IVar if we're just going to be the default (case-insensitive)
|
66
|
+
if sensitivity == false then
|
67
|
+
remove_instance_variable(:@case_sensitive) if instance_variable_defined?(:@case_sensitive)
|
68
|
+
else
|
69
|
+
instance_variable_set(:@case_sensitive, sensitivity)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Return our case-sensitive String variation iff we are marked case-sensitive *and* have a String value,
|
74
|
+
# otherwise just return our frozen deduplicated self value.
|
75
|
+
def itself
|
76
|
+
instance_variable_get(:@case_sensitive)&.is_a?(::String) ? instance_variable_get(:@case_sensitive) : self
|
77
|
+
end
|
78
|
+
|
79
|
+
def case_sensitive
|
80
|
+
instance_variable_get(:@case_sensitive)&.is_a?(::String) ? instance_variable_get(:@case_sensitive) : nil
|
81
|
+
end
|
82
|
+
|
83
|
+
# Set an appropriate value for ourselves given a variety of input.
|
84
|
+
# Even though this is called `#replace` here and in `String`, this method will often be used
|
85
|
+
# to set initial instance values due to nondeterministic attribute order while parsing our XML data.
|
86
|
+
# Normalize `otra` into a glob/key `String`, store its case-folded form as our own value,
# and remember the proper-cased variation in `@case_sensitive` when case matters.
#
# - `otra` may be an instance of this class, a `Symbol`, a `Pathname`, a `String`, or anything with `#to_s`.
# - `case_sensitive:` forces the proper-cased variation to be kept even if it is already all lower-case.
#
# Returns `self`, which now holds the case-folded value.
def replace(otra, case_sensitive: DEFAULT_SENSITIVITY)
  # Extract a usable value from different input types/formats.
  #
  # `File::extname` will return the last dotted component of a String, prepended with the leading dot,
  # e.g. `File::extname("hello.jpg")` => `".jpg"`. We will prepend an asterisk to these to make a glob pattern.
  #
  # `File::extname` will be an empty String for input Strings which contain no dotted components
  # or only have a leading dot, e.g. `File::extname(".bash_profile")` => `""`.
  # Input which already ends in a wildcard is treated as a ready-made glob and kept whole.
  newbuild = case otra
    when self.class then -otra.to_s
    when ::Symbol   then -otra.name
    when ::Pathname then otra.extname.empty? ? -otra.basename.to_s : -otra.extname.prepend(-?*)
    when ::String   then (File.extname(otra).empty? || otra.end_with?(-?*)) ? -otra : -File.extname(otra).prepend(-?*)
    else -otra.to_s
  end

  # Explicitly set the `self` value to the case-folded version of our key.
  folded = -newbuild.downcase(:fold)
  super(folded)

  # Compare the folded `String` to the computed key to decide if we should store a case-sensitive value too.
  # This must be a comparison of the two plain `String`s, *not* `self != newbuild`:
  # `!=` dispatches to our own `==`, which is a glob match that folds case unless we are already
  # marked case-sensitive, so it could never notice a difference that is only in case.
  #
  # If the computed key is already downcase we could still be case-sensitive if we were boolean-marked as such,
  # otherwise we have no need for the IVar and can remove it if one is set.
  #
  # Explicitly check if the IVar == `true`, not just truthiness, because it may also be a `String`
  # if we are `#replace`ing a previous case-sensitive value.
  #
  # NOTE: There is a hole in the logic here where any non-downcased input will cause case-sensitivity,
  #       but this is necessary since our XML parsing might give us a `pattern` attribute callback
  #       before we'd had a chance to set a `case-sensitive` mark.
  #       All of the `case-sensitive="true"` `<glob>`s in current fd.o XML have an upper-case component,
  #       so this hack will make sure we don't discard the proper-cased `String` if we see that callback before the mark.
  if folded != newbuild || case_sensitive || instance_variable_get(:@case_sensitive) == true
    instance_variable_set(:@case_sensitive, newbuild)
  elsif instance_variable_defined?(:@case_sensitive)
    remove_instance_variable(:@case_sensitive)
  end

  self # return the new case-folded value we just set when we called `super`
end # replace
|
123
|
+
|
124
|
+
# Return a boolean describing our case-sensitivity status.
|
125
|
+
# Boolean view of the `@case_sensitive` IVar.
# The IVar usually holds the desired-case variation of the `self` String rather than a boolean,
# so this reports `true` for a stored `String` as well as for a literal `true` mark.
def case_sensitive?
  mark = instance_variable_get(:@case_sensitive)
  # `String`   => we have stored a case variation.
  # `true`     => we have been marked for case-sensitivity on the next `#replace`.
  # otherwise  => `nil`, `false`, or anything else counts as case-insensitive.
  mark.is_a?(::String) || mark.is_a?(::TrueClass)
end
|
134
|
+
|
135
|
+
# Returns case-optional boolean equality between this `StickAround` and a given object `StickAround` or `String`.
|
136
|
+
# This is one of two methods necessary for matching Hash keys, but this method will be called only if `self#hash`
|
137
|
+
# and `otra#hash` return the same Integer value, complicated by the fact that MRI's C implementation of `rb_str_hash_cmp`
|
138
|
+
# won't use our overridden version of `#hash`.
|
139
|
+
# That's why we downcase ourselves in `#replace` and store case variations separately.
|
140
|
+
# Glob-match `otra` (the needle) against our own value (the haystack/pattern) with `File::fnmatch?`.
#
# - `otra` may be another instance of this class, a plain `String`, or a path-like object.
# - Returns `true`/`false`. Anything that is neither String-like nor path-like is simply not equal,
#   so comparing against e.g. `nil` answers `false` instead of raising `TypeError`.
def eql?(otra)
  # An equality predicate should never raise just because the other side is a foreign type.
  # `File::fnmatch?` accepts a `#to_str` needle or a `#to_path` needle (e.g. `Pathname`) and nothing else.
  return false unless otra.respond_to?(:to_str) || otra.respond_to?(:to_path)

  # https://ruby-doc.org/core/File.html#method-c-fnmatch-3F
  #
  # The `File` Class has kinda-poorly-documented Integer constants to control the behavior of `File::fnmatch?`.
  # If this feels non-Ruby-ish it's because this is a POSIX thing:
  # https://pubs.opengroup.org/onlinepubs/9699919799/functions/fnmatch.html
  #
  #   irb(main):061:0> File::constants::keep_if { _1.to_s.include?('FNM_') }
  #   => [:FNM_CASEFOLD, :FNM_EXTGLOB, :FNM_SYSCASE, :FNM_NOESCAPE, :FNM_PATHNAME, :FNM_DOTMATCH, :FNM_SHORTNAME]
  #   irb(main):062:0> File::constants::keep_if { _1.to_s.include?('FNM_') }.map(&File::method(:const_get))
  #   => [8, 16, 0, 1, 2, 4, 0]
  #
  #
  # - `File::FNM_PATHNAME` controls wildcards in the haystack matching `File::SEPARATOR` in the needle:
  #
  #   irb> File.fnmatch?('*.jpg', '/hello.jpg', File::FNM_PATHNAME)
  #   => false
  #   irb> File.fnmatch?('*.jpg', '/hello.jpg')
  #   => true
  #   irb> File.fnmatch?('*.jpg', 'hello.jpg', File::FNM_PATHNAME)
  #   => true
  #   irb> File.fnmatch?('*.jpg', 'hello.jpg')
  #   => true
  #
  #
  # - `File::FNM_DOTMATCH` controls wildcard in the haystack matching `.` in the needle, like *nix-style "hidden" files:
  #
  #   irb> File.fnmatch?('*.jpg', '.hello.jpg', File::FNM_DOTMATCH)
  #   => true
  #   irb> File.fnmatch?('*.jpg', '.hello.jpg')
  #   => false
  #
  #
  # - `File::FNM_EXTGLOB` controls support for brace-delimited glob syntax for haystacks:
  #
  #   irb> File.fnmatch?('*.jp{e,}g', 'hello.jpg', File::FNM_EXTGLOB)
  #   => true
  #   irb> File.fnmatch?('*.jp{e,}g', 'hello.jpeg', File::FNM_EXTGLOB)
  #   => true
  #   irb> File.fnmatch?('*.jp{e,}g', 'hello.jpeg')
  #   => false
  #   irb> File.fnmatch?('*.jp{e,}g', 'hello.jpg')
  #   => false
  #
  #
  # - `File::FNM_CASEFOLD` and `File::FNM_SYSCASE` control the case-sensitivity when matching,
  #   either by folding (explicit case-insensitivity) or by matching the behavior of the host operating system,
  #   *not* the behavior of any specific filesystem on that OS (https://bugs.ruby-lang.org/issues/15363),
  #   e.g. case-sensitive on BSD/Linux:
  #
  #   irb> RUBY_PLATFORM
  #   => "x86_64-linux"
  #   irb> File.fnmatch?('LOICENSE', 'loicense', File::FNM_SYSCASE)
  #   => false
  #   irb> File.fnmatch?('LOICENSE', 'loicense', File::FNM_CASEFOLD)
  #   => true
  #   irb> File.fnmatch?('LOICENSE', 'loicense')
  #   => false
  #
  #
  # - `File::FNM_NOESCAPE` (ominously) controls matching escape sequences literally:
  #   https://github.com/ruby/ruby/blob/master/doc/syntax/literals.rdoc#label-Strings
  #
  #   irb> File.fnmatch?("*.jpg\\", 'hello.jpg', File::FNM_NOESCAPE)
  #   => false
  #   irb> File.fnmatch?("*.jpg\\", 'hello.jpg')
  #   => true
  #
  #
  # - `File::FNM_SHORTNAME` seems to control eight-dot-three filename matching, per the documentation:
  #   "Makes patterns to match short names if existing. Valid only on Microsoft Windows."
  #
  #
  # - Multiple of these Integer Constants can be bitwise-`OR`ed together for simultaneous use:
  #
  #   irb> File.fnmatch?('*.jp{e,}g', '/root/.HeLlO.jPEg', File::FNM_EXTGLOB | File::FNM_CASEFOLD | File::FNM_DOTMATCH)
  #   => true
  #   irb> File.fnmatch?('*.jp{e,}g', '/root/.HeLlO.jPEg', File::FNM_EXTGLOB | File::FNM_CASEFOLD | File::FNM_DOTMATCH | File::FNM_PATHNAME)
  #   => false

  # Support testing `otra` as either another instance of this class or as a plain `String`,
  # in which case it will not have a method `#case_sensitive?`.
  # The match is case-sensitive if either side asks for it; against a plain `String` only our own setting counts.
  strict = case_sensitive? || (otra.respond_to?(:case_sensitive?) && otra.case_sensitive?)

  File.fnmatch?(
    itself,       # Haystack
    otra.itself,  # Needle
    File::FNM_DOTMATCH | File::FNM_EXTGLOB | (strict ? 0 : File::FNM_CASEFOLD)
  )
end # eql?
|
233
|
+
|
234
|
+
# Hash-key usage depends on `#eql?`, but `:==` should have identical behavior for our own uses.
|
235
|
+
alias_method(:==, :eql?) # Copies `#eql?` as it is defined at this point, so this line must stay below that `def`.
|
236
|
+
|
237
|
+
# Return an Integer hash value for this object. This method and `#eql?` are used by `Hash`, `Set`, and `#uniq` to
|
238
|
+
# associate separate Objects with each other for deduplication or for use as `Hash` keys.
|
239
|
+
# The `eql?` method will be called only *after* two Integer `#hash` values match!
|
240
|
+
#
|
241
|
+
# NOTE: MRI will not use this function in many cases!
|
242
|
+
# It has C implementations of methods like `rb_str_hash_cmp` for `Hash` lookups, and this is usually a Good Thing™
|
243
|
+
# since it makes `Hash`es fast when using `String` or `Symbol` as keys.
|
244
|
+
# Subclassing built-in types like `String` allows/forces us to use these same accelerated code paths,
|
245
|
+
# and it was incredibly confusing for me why my custom String subclass was behaving so strangely
|
246
|
+
# when used as a Hash key until I had a hunch to read MRI's `string.c` and `hash.c` and confirmed.
|
247
|
+
# I found this write-up once I knew to search for "rb_str_hash_cmp": https://kate.io/blog/strange-hash-instances-in-ruby/
|
248
|
+
#
|
249
|
+
# I'm going to define this anyway because it could still be useful in certain corner cases, but be aware of the above!
|
250
|
+
# This is the reason I explicitly `downcase` our self value in `#replace`, because otherwise the Hash keys will never match
|
251
|
+
# and `#eql?` will never even be called.
|
252
|
+
# Returns an Integer hash value for this object, bucketing glob patterns by their most distinctive literal part:
#
# - a glob that does not start with a wildcard (e.g. `"readme*"`) hashes by its first six characters, downcased.
# - any other glob with an extension (e.g. `"*.jpg"`) hashes by that extension, downcased, without the dot.
# - everything else defers to the superclass implementation.
def hash
  if self.include?(-?*) && !self.start_with?(-?*)
    # Non-bang `downcase`: `downcase!` returns `nil` when nothing changed,
    # which would hash every already-lowercase prefix as `nil`.
    self[...6].downcase.hash
  elsif self.include?(-?*) && !File.extname(self).empty?
    # Hash the extension text; the bare `String` itself is not a valid `#hash` return value.
    File.extname(self).delete_prefix(-?.).downcase.hash
  else
    super
  end
end
|
258
|
+
|
259
|
+
end # StickAround
|
260
|
+
end # class CHECKING::YOU
|