checking-you-out 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +661 -0
- data/README.md +14 -0
- data/bin/are-we-unallocated-yet +9 -0
- data/bin/benchmark +66 -0
- data/bin/checking-you-out +8 -0
- data/bin/repl +7 -0
- data/bin/test-my-best +4 -0
- data/lib/checking-you-out.rb +40 -0
- data/lib/checking-you-out/auslandsgesprach.rb +253 -0
- data/lib/checking-you-out/ghost_revival.rb +71 -0
- data/lib/checking-you-out/ghost_revival/mr_mime.rb +390 -0
- data/lib/checking-you-out/ghost_revival/xross_infection.rb +146 -0
- data/lib/checking-you-out/inner_spirit.rb +215 -0
- data/lib/checking-you-out/party_starter.rb +202 -0
- data/lib/checking-you-out/party_starter/stick_around.rb +260 -0
- data/lib/checking-you-out/party_starter/weighted_action.rb +41 -0
- data/lib/checking-you-out/sweet_sweet_love_magic.rb +226 -0
- data/mime/packages/distorted-types.xml +68 -0
- data/mime/packages/third-party/shared-mime-info/freedesktop.org.xml.in +7672 -0
- data/mime/packages/third-party/tika-mimetypes/tika-mimetypes.xml +2762 -0
- metadata +232 -0
@@ -0,0 +1,215 @@
|
|
1
|
+
require 'set' unless defined? ::Set
|
2
|
+
require 'pathname' unless defined? ::Pathname
|
3
|
+
|
4
|
+
|
5
|
+
# Utility Modules/procs/lambdas/etc for generic operations like checking WeightedActions.
|
6
|
+
require_relative 'party_starter' unless defined? ::CHECKING::YOU::WeightedAction
|
7
|
+
|
8
|
+
|
9
|
+
# This base Struct will be used as the Hash key for its matching `OUT` subclass object,
|
10
|
+
# and its members correspond to the three major parts of an IETF "Content-Type" String,
|
11
|
+
# e.g. "application/x-saturn-rom" → :x, :application, :"saturn-rom".
|
12
|
+
#
|
13
|
+
# This is kind of a leaky abstraction since I want to support non-IETF type systems too,
|
14
|
+
# but the IETF system is by far the most relevant one to us because the most exhaustive
|
15
|
+
# source data (`shared-mime-info`) is based on that format and because, you know, Internet.
|
16
|
+
# See the adjacent `auslandsgesprach.rb` for the parser and more info.
|
17
|
+
#
|
18
|
+
#
|
19
|
+
# The instances of a `Struct` subclass with at most `RSTRUCT_EMBED_LEN_MAX` members
|
20
|
+
# can fit entirely within an `RStruct` without additional heap allocation.
|
21
|
+
# In MRI (at least as of 3.0) the `RSTRUCT_EMBED_LEN_MAX` is 3, so CYI uses three members.
|
22
|
+
#
|
23
|
+
# For more info see:
|
24
|
+
# - https://github.com/ruby/ruby/blob/master/gc.c
|
25
|
+
# - http://patshaughnessy.net/2013/2/8/ruby-mri-source-code-idioms-3-embedded-objects
|
26
|
+
CHECKING::YOU::IN ||= Struct.new(
|
27
|
+
# Intentionally avoiding naming taxonomic ranks like "domain", "class", or "order"
|
28
|
+
# whose names are already common in computing.
|
29
|
+
:kingdom,
|
30
|
+
:phylum,
|
31
|
+
:genus,
|
32
|
+
) do
|
33
|
+
# Promote any CYI to its CYO singleton. CYO has the opposites of these methods.
|
34
|
+
def out; ::CHECKING::YOU::OUT::new(self); end
|
35
|
+
def in; self; end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Main Struct subclass for in-memory type representation.
|
39
|
+
# Instances of the base `CHECKING::YOU::IN` Struct will refer to only one of these,
|
40
|
+
# and this matching object will contain all relevant data about the type,
|
41
|
+
# such as file extension(s), `magic` bytes, and variations of a base type like all of:
|
42
|
+
# - "application/vnd.wordperfect;"
|
43
|
+
# - "application/vnd.wordperfect;version=4.2"
|
44
|
+
# - "application/vnd.wordperfect;version=5.0"
|
45
|
+
# - "application/vnd.wordperfect;version=5.1"
|
46
|
+
# - "application/vnd.wordperfect;version=6.x"
|
47
|
+
# …will be represented in a single `CHECKING::YOU::OUT` object.
|
48
|
+
class ::CHECKING::YOU::OUT < ::CHECKING::YOU::IN
|
49
|
+
|
50
|
+
# Absolute path to the root of the Gem — the directory containing `bin`,`docs`,`lib`, etc.
|
51
|
+
GEM_ROOT = proc { ::Pathname.new(__dir__).join(*Array.new(2, -'..')).expand_path.realpath }
|
52
|
+
|
53
|
+
# Time object representing the day this running CYO Gem was packaged.
|
54
|
+
#
|
55
|
+
# `Gem::Specification#date` can be slightly misleading when developing locally with Bundler using `bundle exec`.
|
56
|
+
# One might expect the result of `#date` to be "now" (including hours/minutes/seconds) in UTC for such a runtime-packaged Gem,
|
57
|
+
# but it will always be midnight UTC of the current day (also in UTC), i.e. a date that is always[0] in the past.
|
58
|
+
#
|
59
|
+
# After ${your-UTC-offset} hours before midnight localtime, this will give you a *day* that seems to be in the future
|
60
|
+
# compared to a system clock displaying localtime despite that *date* UTC still being in the past,
|
61
|
+
# e.g. as I write this comment at 2021-05-25 22:22 PST, `GEM_PACKAGE_TIME.call` returns `2021-05-26 00:00:00 UTC`.
|
62
|
+
#
|
63
|
+
# Rescue from `Gem::MissingSpecError`'s parent to support developing locally with just `require_relative` and no Bundler.
|
64
|
+
#
|
65
|
+
# [0]: unless you manage to `bundle exec` at exactly 00:00:00 UTC :)
|
66
|
+
GEM_PACKAGE_TIME = proc { begin; Gem::Specification::find_by_name(-'checking-you-out').date; rescue Gem::LoadError; Time.now; end }
|
67
|
+
|
68
|
+
# Name/value pair parsed out of a single `name=value` String.
Species = Struct.new(:name, :value) do
  # Build a `Species` from a `name=value` String.
  #
  # Only the first `=` separates the name from the value, so a value that itself
  # contains `=` (e.g. Base64 padding) stays intact. Splitting on every `=` would
  # hand `new` more fields than this Struct has members and raise `ArgumentError`.
  #
  # A String with no `=`, or with nothing after the `=`, yields a `nil` value.
  # An empty String yields a `Species` whose `name` and `value` are both `nil`.
  def self.from_string(param_string)
    name, value = param_string.split(-?=, 2)
    # A limited `split` keeps a trailing empty field; normalize it back to `nil`
    # so `"name="` still behaves like it did with the unlimited `split`.
    new(name, value&.empty? ? nil : value)
  end
end
|
73
|
+
|
74
|
+
# Main memoization Hash for our loaded Type data.
|
75
|
+
# { CHECKING::YOU::IN => CHECKING::YOU::OUT }
|
76
|
+
def self.all_night; @all_night ||= Hash.new(nil); end
|
77
|
+
|
78
|
+
# Return a singleton instance for any CYO.
|
79
|
+
def self.new(taxa)
|
80
|
+
# Support IETF String argument to this method, e.g. ::CHECKING::YOU::OUT::new('application/octet-stream')
|
81
|
+
return self.from_ietf_media_type(taxa) if taxa.is_a?(String)
|
82
|
+
# Otherwise return the memoized CYO singleton of this type.
|
83
|
+
self.all_night[
|
84
|
+
taxa.is_a?(::CHECKING::YOU::IN) ? taxa : super(*taxa)
|
85
|
+
] ||= self.allocate.tap { |cyo| cyo.send(:initialize, *taxa) }
|
86
|
+
end
|
87
|
+
|
88
|
+
# Demote any CYO to a CYI that can be passed around in just 40 bytes.
|
89
|
+
# CYI has the opposites of these methods.
|
90
|
+
def out; self; end
|
91
|
+
def in; self.class.all_night.key(self); end
|
92
|
+
|
93
|
+
|
94
|
+
# Get a CYO, Set[CYO], or nil by file-extension, e.g. `doc` => { CYO msword, CYO rtf }.
|
95
|
+
POSTFIX_KEY = proc {
|
96
|
+
# Re-use a single search structure to avoid allocating an Object per search.
|
97
|
+
scratch = ::CHECKING::YOU::StickAround.new(-'')
|
98
|
+
# Additionally accelerate multiple searches for the same thing by avoiding `StickAround#replace`
|
99
|
+
# if the new search key already matches the previous search key.
|
100
|
+
# Mark `case_sensitive: false` here for testing arbitrarily-named input.
|
101
|
+
-> { scratch.eql?(_1) ? scratch : scratch.replace(_1, case_sensitive: false) }
|
102
|
+
}.call
|
103
|
+
# Look up whatever is stored under a Postfix key in the classwide `@after_forever` Hash.
#
# Returns the stored value for the key built by `POSTFIX_KEY`, or `nil` when the key
# is absent or when `@after_forever` has not been created yet.
def self.from_postfix(stick_around)
  # `@after_forever` is only created the first time `add_pathname_fragment` stores a Postfix,
  # so use safe navigation instead of raising `NoMethodError` on `nil` before that happens.
  # Safe navigation also skips building the search key when there is nothing to search.
  self.instance_variable_get(:@after_forever)&.[](POSTFIX_KEY.call(stick_around))
end
|
106
|
+
|
107
|
+
# Get a Hash[CYO] or nil for arbitrary non-file-extension glob match of a File basename.
|
108
|
+
# Collect every entry of the classwide `@stick_around` Hash whose key is `eql?` to the argument.
#
# Returns a `Hash` of the matching key/value pairs, or `nil` when nothing matched
# or when `@stick_around` has not been created yet. Never returns an empty `Hash`.
def self.from_glob(stick_around)
  # `@stick_around` is only created the first time `add_pathname_fragment` stores a glob,
  # so use safe navigation instead of raising `NoMethodError` on `nil` before that happens.
  matched = self.instance_variable_get(:@stick_around)&.select { |glob, _cyo| glob.eql?(stick_around) }
  matched unless matched.nil? || matched.empty?
end
|
115
|
+
|
116
|
+
def self.from_pathname(pathname)
|
117
|
+
return self.from_glob(pathname) || self.from_postfix(pathname)
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
# Add a new Postfix or Glob for a specific type.
|
122
|
+
def add_pathname_fragment(fragment)
|
123
|
+
if fragment.start_with?(-'*.') and fragment.count(-?.) == 1 and fragment.count(-?*) == 1 then
|
124
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@postfixes, fragment, self)
|
125
|
+
::CHECKING::YOU::CLASS_NEEDLEMAKER.call(:@after_forever, fragment, self)
|
126
|
+
else
|
127
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@globs, fragment, self)
|
128
|
+
::CHECKING::YOU::CLASS_NEEDLEMAKER.call(:@stick_around, fragment, self)
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
|
133
|
+
# Get a `Set` of this CYO's own CYI plus every alias CYI stored in `@aka`, at minimum just `Set[self.in]`.
|
134
|
+
def aka
|
135
|
+
return case @aka
|
136
|
+
when nil then Set[self.in]
|
137
|
+
when self.class, self.class.superclass then Set[self.in, @aka]
|
138
|
+
when ::Set then Set[self.in, *@aka]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Take an additional CYI, store it locally, and memoize it as an alias for this CYO.
|
143
|
+
def add_aka(taxa)
|
144
|
+
taxa = taxa.is_a?(::CHECKING::YOU::IN) ? taxa : self.class.superclass.new(*taxa)
|
145
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@aka, taxa, self)
|
146
|
+
self.class.all_night[taxa] = self
|
147
|
+
end
|
148
|
+
|
149
|
+
# Forget a CYI alias of this Type. Capable of unsetting the "real" CYI as well if desired.
|
150
|
+
def remove_aka(taxa)
|
151
|
+
taxa = taxa.is_a?(::CHECKING::YOU::IN) ? taxa : self.class.superclass.new(*taxa)
|
152
|
+
self.class.all_night.delete(taxa) if self.class.all_night[taxa] === self
|
153
|
+
end
|
154
|
+
|
155
|
+
attr_reader :parents, :children
|
156
|
+
|
157
|
+
# Take an additional CYO, store it locally as our parent, and ask it to add ourselves as its child.
|
158
|
+
def add_parent(parent_cyo)
|
159
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@parents, parent_cyo, self)
|
160
|
+
parent_cyo.add_child(self) unless parent_cyo.children&.include?(self)
|
161
|
+
end
|
162
|
+
|
163
|
+
# Take an additional CYO, store it locally as our child, and ask it to add ourselves as its parent.
|
164
|
+
def add_child(child_cyo)
|
165
|
+
::CHECKING::YOU::INSTANCE_NEEDLEMAKER.call(:@children, child_cyo, self)
|
166
|
+
child_cyo.add_parent(self) unless child_cyo.parents&.include?(self)
|
167
|
+
end
|
168
|
+
|
169
|
+
# Get a `Set` of this CYO and all of its parent CYOs, at minimum just `Set[self]`.
|
170
|
+
def adults_table
|
171
|
+
return case @parents
|
172
|
+
when nil then Set[self]
|
173
|
+
when self.class, self.class.superclass then Set[self, @parents]
|
174
|
+
when ::Set then Set[self, *@parents]
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
# Get a `Set` of this CYO and all of its child CYOs, at minimum just `Set[self]`.
|
179
|
+
def kids_table
|
180
|
+
return case @children
|
181
|
+
when nil then Set[self]
|
182
|
+
when self.class, self.class.superclass then Set[self, @children]
|
183
|
+
when ::Set then Set[self, *@children]
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
# Get a `Set` of this CYO and all parents and children, at minimum just `Set[self]`.
|
188
|
+
def family_tree; self.kids_table | self.adults_table; end
|
189
|
+
|
190
|
+
# Storage for descriptions (`<comment>`), acronyms, suitable iconography, and other boring metadata, e.g.:
|
191
|
+
# <mime-type type="application/vnd.oasis.opendocument.text">
|
192
|
+
# <comment>ODT document</comment>
|
193
|
+
# <acronym>ODT</acronym>
|
194
|
+
# <expanded-acronym>OpenDocument Text</expanded-acronym>
|
195
|
+
# <generic-icon name="x-office-document"/>
|
196
|
+
# […]
|
197
|
+
# </mime-type>
|
198
|
+
attr_accessor :description
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
# IETF Media-Type parser and methods that use that parser.
|
203
|
+
require_relative 'auslandsgesprach' unless defined? ::CHECKING::YOU::IN::AUSLANDSGESPRÄCH
|
204
|
+
::CHECKING::YOU::IN.extend(::CHECKING::YOU::IN::AUSLANDSGESPRÄCH)
|
205
|
+
::CHECKING::YOU::IN.include(::CHECKING::YOU::IN::INLANDGESPRÄCH)
|
206
|
+
::CHECKING::YOU::OUT.extend(::CHECKING::YOU::OUT::AUSLANDSGESPRÄCH)
|
207
|
+
|
208
|
+
# Content matching à la `libmagic`/`file`.
|
209
|
+
require_relative 'sweet_sweet_love_magic' unless defined? ::CHECKING::YOU::SweetSweet♥Magic
|
210
|
+
::CHECKING::YOU::OUT.extend(::CHECKING::YOU::SweetSweet♡Magic)
|
211
|
+
::CHECKING::YOU::OUT.prepend(::CHECKING::YOU::SweetSweet♥Magic)
|
212
|
+
|
213
|
+
# Methods for loading type data from `shared-mime-info` package XML files.
|
214
|
+
require_relative 'ghost_revival' unless defined? ::CHECKING::YOU::GHOST_REVIVAL
|
215
|
+
::CHECKING::YOU::OUT.extend(::CHECKING::YOU::OUT::GHOST_REVIVAL)
|
@@ -0,0 +1,202 @@
|
|
1
|
+
require 'set' unless defined? ::Set
|
2
|
+
require 'pathname' unless defined? ::Pathname
|
3
|
+
|
4
|
+
|
5
|
+
# This file defines/imports various utility Modules/procs/etc that should be available
|
6
|
+
# to all other CYO components without `including`/`extending`.
|
7
|
+
require_relative 'party_starter/weighted_action' unless defined? ::CHECKING::YOU::WeightedAction
|
8
|
+
require_relative 'party_starter/stick_around' unless defined? ::CHECKING::YOU::StickAround
|
9
|
+
|
10
|
+
class CHECKING::YOU
|
11
|
+
|
12
|
+
# The following two `proc`s handle classwide-memoization and instance-level assignment
|
13
|
+
# for values that may be Enumerable but often refer to only a single Object.
|
14
|
+
#
|
15
|
+
# For example, most `Postfix`es (file extensions) will only ever belong to a single CYO Object,
|
16
|
+
# but a handful represent possibly-multiple types, like how `.doc` can be an MSWord file or WordPad RTF.
|
17
|
+
#
|
18
|
+
# These assignment procs take a storage haystack, a needle to store, and the CYO receiver to which the needle refers.
|
19
|
+
# They will set `haystack[needle] => CYO` if that needle is unique and unset, or they will convert
|
20
|
+
# an existing single `haystack[needle] => CYO` assignment to `haystack[needle] => Set[existingCYO, newCYO]`.
|
21
|
+
#
|
22
|
+
# This is an admittedly-annoying complexity-for-performance tradeoff with the goal of allocating
|
23
|
+
# as few spurious containers as possible instead of explicitly initializing a Set for every needle
|
24
|
+
# when most of them would wastefully be a Set of just a single thing.
|
25
|
+
CLASS_NEEDLEMAKER = proc { |haystack, needle, receiver|
|
26
|
+
# Create the container if this is the very first invocation.
|
27
|
+
receiver.class.instance_variable_set(haystack, Hash.new(nil)) unless receiver.class.instance_variable_defined?(haystack)
|
28
|
+
|
29
|
+
# Set the `haystack` Hash's `needle` key to the `receiver` if the `key` is unset, otherwise
|
30
|
+
# to a `Set` of the existing value plus `receiver` if that value is not `receiver` already.
|
31
|
+
receiver.class.instance_variable_get(haystack).tap { |awen|
|
32
|
+
case awen[needle]
|
33
|
+
when nil then awen[needle] = receiver
|
34
|
+
when ::Set then awen[needle].add(receiver)
|
35
|
+
when receiver.class then awen[needle] = Set[awen[needle], receiver] unless awen[needle] == receiver
|
36
|
+
end
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
# This is the instance-level version of the above, e.g. a CYO with only one Postfix
|
41
|
+
# will assign `cyo.:@postfixes = Postfix`, and a CYO with many Postfixes will assign
|
42
|
+
# e.g. `cyo.:@postfixes = Set[post, fix, es, …]`.
|
43
|
+
INSTANCE_NEEDLEMAKER = proc { |haystack, needle, receiver|
|
44
|
+
if receiver.instance_variable_defined?(haystack) then
|
45
|
+
receiver.instance_variable_get(haystack).add(needle)
|
46
|
+
else
|
47
|
+
receiver.instance_variable_set(haystack, Set[needle])
|
48
|
+
end
|
49
|
+
}
|
50
|
+
|
51
|
+
|
52
|
+
# Test a Pathname representing an extant file whose contents and metadata we can use.
|
53
|
+
# This is separated into a lambda due to the complexity, since the entry-point might
|
54
|
+
# be given a String that could represent a Media Type, a hypothetical path,
|
55
|
+
# an extant path, or even raw stream contents. It could be given a Pathname representing
|
56
|
+
# either a hypothetical or extant file. It could be given an IO/Stream object.
|
57
|
+
# Several input possibilities will end up calling this lambda.
|
58
|
+
#
|
59
|
+
# Some of this complexity is my fault, since I'm doing a lot of variable juggling
|
60
|
+
# to avoid as many new-Object-allocations as possible in the name of performance
|
61
|
+
# since this library is the very core-est core of DistorteD;
|
62
|
+
# things like assigning Hash values to single CYO objects the first time that key is stored
|
63
|
+
# then replacing that value with a Set iff that key needs to reference any additional CYO.
|
64
|
+
#
|
65
|
+
# - `::from_xattr` can return `nil` or a single `CYO` depending on filesystem extended attributes.
|
66
|
+
# It is very very unlikely that most people will ever use this, but I think it's cool 8)
|
67
|
+
#
|
68
|
+
# - `::from_postfix` can return `nil`, `CYO`, or `Set` since I decided to store Postfixes
|
69
|
+
# separately from freeform globs since file-extension matches are the vast majority of globs.
|
70
|
+
# Postfixes avoid needing to be weighted since they all represent the same final pathname component
|
71
|
+
# and should never result in multiple conflicting Postfix key matches.
|
72
|
+
# A single Postfix key can represent multiple CYOs, though; hence the possible `Set`.
|
73
|
+
#
|
74
|
+
# - `::from_glob` can return `nil` or `Hash` since even a single match will include the weighted key.
|
75
|
+
#
|
76
|
+
# - `::from_content` can return `nil` or `Hash` based on a `libmagic`-style match of file/stream contents.
|
77
|
+
# Many common types can be determined from the first four bytes alone, but we support matching
|
78
|
+
# arbitrarily-long sequences against arbitrarily-big byte range boundaries.
|
79
|
+
# These keys will also be weighted, even for a single match.
|
80
|
+
TEST_EXTANT_PATHNAME = -> (pathname, so_deep: true, only_one_match: true) {
|
81
|
+
|
82
|
+
# Never return empty Enumerables.
|
83
|
+
# Yielding-self to this proc will `nil`-ify anything that's `:empty?`
|
84
|
+
# and will pass any non-Enumerable Objects through.
|
85
|
+
# NOTE: the predicate name must be spelled `:empty?` with its question mark.
# Core collections do not respond to a bare `:empty`, so testing for that name
# would pass every value through untouched, empty collections included.
point_zero = proc { _1.respond_to?(:empty?) ? (_1.empty? ? nil : _1) : _1 }
|
86
|
+
|
87
|
+
# Our matching block will return a single CYO when possible, and can optionally
|
88
|
+
# return multiple CYO matches for ambiguous files/streams.
|
89
|
+
# Multiple matching must be opted into with `only_one_match: false` so it doesn't need to be
|
90
|
+
# checked by every caller that is fine with best-effort and wants to minimize allocations.
|
91
|
+
# Reduce a match result to its final form:
# - `nil` and anything that does not respond to `:empty?` pass through unchanged.
# - An empty collection becomes `nil`.
# - A collection of exactly one member is unwrapped to that member
#   (to the first *value* for a `Hash`).
# - A bigger collection is unwrapped the same way when `only_one_match` is truthy,
#   otherwise it is returned whole.
#
# Only `:empty?` is probed. The previous extra probe for `:first?` named a method
# that does not exist, so it could never be true. It is dropped rather than "fixed"
# to `:first`, because that would send `empty?` to objects which lack it.
one_or_eight = proc { |huh|
  if huh.nil? || !huh.respond_to?(:empty?)
    huh
  elsif huh.empty?
    nil
  elsif huh.size == 1 || only_one_match
    # Past the `empty?` check a size other than 1 means "more than one".
    huh.is_a?(::Hash) ? huh.values.first : huh.first
  else
    huh
  end
}
|
103
|
+
|
104
|
+
# Test all "glob" matches against all child Types of all "magic" matches to allow for
|
105
|
+
# nuanced detection of ambiguous streams where a `magic` match returns multiple possibilities,
|
106
|
+
# e.g. using a `.doc` Postfix-match to choose a `text-plain` glob-match for non-Word `.doc` files
|
107
|
+
# or to choose a `application/msword` glob-match over a more generic `application/x-ole-storage`
|
108
|
+
# magic-match when the magic weights alone are not enough information to make the correct choice.
|
109
|
+
# irb> ::CHECKING::YOU::OUT::from_postfix('doc')
|
110
|
+
# => #<Set: {#<CHECKING::YOU::OUT application/msword>, #<CHECKING::YOU::OUT text/plain>}>
|
111
|
+
#
|
112
|
+
# Again, a lot of the complexity here is "my fault" in that I could avoid it by explicitly using
|
113
|
+
# the same data structures for all the different inputs, but I need this to be as fast
|
114
|
+
# and as low-overhead as possible which means avoiding allocations of things like
|
115
|
+
# Enumerables that end up holding only a single other object.
|
116
|
+
# Obviously that leads to a lot of variation in result values from helper methods,
|
117
|
+
# so I'll own that here instead of ever making callsites deal with it.
|
118
|
+
#
|
119
|
+
# This `proc`'s output will introduce a little more of that same complexity since it will be `nil`
|
120
|
+
# if either input is `nil`, will be a single CYO if there is only one union match,
|
121
|
+
# or a `Set` if there are still multiple possibilities.
|
122
|
+
magic_children = proc { |glob, magic|
|
123
|
+
# NOTE: CYO deviates from `shared-mime-info`'s behavior very slightly here!
|
124
|
+
#
|
125
|
+
# `shared-mime-info`'s "Recommended checking order" documentation sez:
|
126
|
+
# "If any of the mimetypes resulting from a glob match is equal to or a subclass of the result
|
127
|
+
# from the magic sniffing, use this as the result. This allows us for example to distinguish text files
|
128
|
+
# called 'foo.doc' from MS-Word files with the same name, as the magic match for the MS-Word file would be
|
129
|
+
# `application/x-ole-storage` which the MS-Word type inherits."
|
130
|
+
#
|
131
|
+
# Our behavior is identical except it allows glob matches which are a *superclass* of a
|
132
|
+
# magic-match in addition to subclass or equal-to, i.e. using `:family_tree` for comparison here
|
133
|
+
# instead of using `:kids_table`. There might be a downside to this that I haven't found yet
|
134
|
+
# but it allows CYO to better match some things, e.g. matching a `'.flv'` video file as
|
135
|
+
# `'video/x-flv'` instead of as `'video/x-javafx'`, since fd.o has the latter as a subclass of the former.
|
136
|
+
case [glob, magic]
|
137
|
+
in ::NilClass, * then nil
|
138
|
+
in *, ::NilClass then nil
|
139
|
+
in ::Set, ::Hash then glob & magic.values.to_set.map(&:family_tree).reduce(&:&)
|
140
|
+
in ::Set, ::CHECKING::YOU::OUT then glob & magic.kids_table
|
141
|
+
in ::Hash, ::Hash then glob.values.to_set & magic.values.to_set.map(&:family_tree).reduce(&:&)
|
142
|
+
in ::CHECKING::YOU::OUT, ::Hash then magic.values.to_set.map(&:family_tree).reduce(&:&)&.include?(glob) ? glob : nil
|
143
|
+
in ::Hash, ::CHECKING::YOU::OUT then glob.values.to_set & magic.kids_table
|
144
|
+
in ::CHECKING::YOU::OUT, ::CHECKING::YOU::OUT then glob == magic ? glob : nil
|
145
|
+
else nil
|
146
|
+
end.yield_self(&point_zero)
|
147
|
+
}
|
148
|
+
|
149
|
+
# "If a MIME type is provided explicitly (eg, by a ContentType HTTP header, a MIME email attachment,
|
150
|
+
# an extended attribute or some other means) then that should be used instead of guessing."
|
151
|
+
# This will probably always be `nil` since this is a niche feature, but we have to test it first.
|
152
|
+
::CHECKING::YOU::OUT::from_xattr(pathname) || begin
|
153
|
+
|
154
|
+
# "Start by doing a glob match of the filename. Keep only globs with the biggest weight."
|
155
|
+
# "If the patterns are different, keep only matched with the longest pattern."
|
156
|
+
# "If after this, there is one or more matching glob, and all the matching globs result in
|
157
|
+
# the same mimetype, use that mimetype as the result."
|
158
|
+
# This can be `nil`, `CYO`, a `Set` of Postfix matches, or a `Hash` of weighted Glob matches.
|
159
|
+
glob_matched = ::CHECKING::YOU::OUT::from_pathname(pathname)
|
160
|
+
|
161
|
+
# "If the glob matching fails or results in multiple conflicting mimetypes,
|
162
|
+
# read the contents of the file and do magic sniffing on it."
|
163
|
+
# This can be `nil` or a `Hash` of weighted magic matches.
|
164
|
+
magic_matched = (glob_matched.nil? || glob_matched.is_a?(Enumerable) || so_deep) ? ::CHECKING::YOU::OUT::from_content(pathname) : nil
|
165
|
+
|
166
|
+
# Make a decision based on the two possible matches above plus a third match category
|
167
|
+
# based on a union between the glob match and all children of all magic matches.
|
168
|
+
# See the relevant proc above. Its result will always be `nil` if either input is `nil`.
|
169
|
+
#
|
170
|
+
# "If there was no glob match, use the magic match as the result."
|
171
|
+
# "Otherwise use the result of the glob match that has the highest weight."
|
172
|
+
return case [glob_matched, magic_matched, magic_children.call(glob_matched, magic_matched)]
|
173
|
+
in ::NilClass, ::Hash, ::NilClass then LEGENDARY_HEAVY_GLOW.call(magic_matched, :weight)
|
174
|
+
in ::CHECKING::YOU::OUT, ::NilClass, ::NilClass then glob_matched
|
175
|
+
in ::Set, ::NilClass, ::NilClass then glob_matched
|
176
|
+
in ::Hash, ::NilClass, ::NilClass then LEGENDARY_HEAVY_GLOW.call(glob_matched, [:weight, :length])
|
177
|
+
in *, ::CHECKING::YOU::OUT => only_one_type then only_one_type
|
178
|
+
in ::Set, ::Hash, ::Set => magic_children then
|
179
|
+
# Choose the union-matched type having the heaviest magic-matched weight.
|
180
|
+
LEGENDARY_HEAVY_GLOW.call(magic_matched.keep_if { |_magic, cyo| magic_children.include?(cyo) }, :weight)
|
181
|
+
in ::Hash, ::Hash, ::Set => magic_children then
|
182
|
+
# Choose the union-matched type having the heaviest glob-matched weight,
|
183
|
+
# and then additionally the longest glob string if there are still multiple matches.
|
184
|
+
LEGENDARY_HEAVY_GLOW.call(glob_matched.keep_if { |_glob, cyo| magic_children.include?(cyo) }, [:weight, :length])
|
185
|
+
in ::CHECKING::YOU::OUT, ::Hash, ::NilClass then glob_matched
|
186
|
+
in ::CHECKING::YOU::OUT, ::Hash, ::Set => magic_children then
|
187
|
+
# Choose the single glob-matched type iff it was also magic-matched,
|
188
|
+
# otherwise choose the heaviest magic-matched type.
|
189
|
+
magic_matched.values.include?(glob_matched) ? glob_matched : LEGENDARY_HEAVY_GLOW.call(magic_matched, :weight)
|
190
|
+
in ::NilClass, ::NilClass, ::NilClass then
|
191
|
+
# "If no magic rule matches the data (or if the content is not available),
|
192
|
+
# use the default type of application/octet-stream for binary data, or text/plain for textual data."
|
193
|
+
# "Note: Checking the first 128 bytes of the file for ASCII control characters is a good way to guess
|
194
|
+
# whether a file is binary or text, but note that files with high-bit-set characters should still be
|
195
|
+
# treated as text since these can appear in UTF-8 text, unlike control characters."
|
196
|
+
::CHECKING::YOU::OUT::from_ietf_media_type('application/octet-stream')
|
197
|
+
else nil
|
198
|
+
end.yield_self(&one_or_eight)
|
199
|
+
end # ::CHECKING::YOU::OUT::from_xattr(pathname) || begin
|
200
|
+
} # TEST_EXTANT_PATHNAME
|
201
|
+
|
202
|
+
end # class CHECKING::YOU
|
@@ -0,0 +1,260 @@
|
|
1
|
+
require 'pathname' unless defined?(::Pathname)
|
2
|
+
|
3
|
+
|
4
|
+
class CHECKING::YOU
|
5
|
+
# Provide case-optional String-like keys for Postfixes, Globs, etc.
|
6
|
+
#
|
7
|
+
# From Ruby's `Hash` docs: "Two objects refer to the same hash key when their hash value is identical
|
8
|
+
# and the two objects are eql? to each other"
|
9
|
+
# I tried to subclass String and just override `:eql?` and `:hash` for case-insensitive lookups,
|
10
|
+
# but it turns out not to be that easy due to MRI's C comparison functions for String, Symbol, etc.
|
11
|
+
#
|
12
|
+
# It was super-confusing because I could call e.g. `'DOC'.eql? 'doc'` manually and get `true`,
|
13
|
+
# but it would always fail to work when used as a `Hash` key, when calling `uniq`, or in a `Set`:
|
14
|
+
#
|
15
|
+
# irb(main):049:1* Lol = Class.new(String).tap {
|
16
|
+
# irb(main):050:1* _1.define_method(:hash) do; self[0..5].downcase!.hash; end;
|
17
|
+
# irb(main):051:1* _1.define_method(:eql?) do |lol|; self[0..5].casecmp?(lol[0..5]); end;
|
18
|
+
# irb(main):052:1* _1.alias_method(:==, :eql?)
|
19
|
+
# irb(main):053:0> }
|
20
|
+
# irb(main):054:0> fart = Lol.new("abcdefg")
|
21
|
+
# irb(main):055:0> butt = Lol.new("abcdefgh")
|
22
|
+
# irb(main):056:0> fart == butt
|
23
|
+
# => true
|
24
|
+
# irb(main):057:0> fart.eql? butt
|
25
|
+
# => true
|
26
|
+
# irb(main):058:0> fart.hash
|
27
|
+
# => 1243221847611081438
|
28
|
+
# irb(main):059:0> butt.hash
|
29
|
+
# => 1243221847611081438
|
30
|
+
# irb(main):060:0> {fart => "smella"}[butt]
|
31
|
+
# => nil
|
32
|
+
# irb(main):061:0> {fart => "smella"}[fart]
|
33
|
+
# => "smella"
|
34
|
+
#
|
35
|
+
# I'm not the first to run into this, as I found when searching for `"rb_str_hash_cmp"`:
|
36
|
+
# https://kate.io/blog/strange-hash-instances-in-ruby/
|
37
|
+
#
|
38
|
+
# To work around this I will explicitly `downcase` the actual String subclass' value
|
39
|
+
# and just let the hashes collide for differently-cased values, then `eql?` will decide.
|
40
|
+
# This is still slower than the all-C String code but is the fastest method I've found
|
41
|
+
# to achieve this without doubling my Object allocations by wrapping each String in a Struct.
|
42
|
+
StickAround = Class.new(::String) do
|
43
|
+
|
44
|
+
# Be case-insensitive by default so we can match any filename.
|
45
|
+
DEFAULT_SENSITIVITY = false
|
46
|
+
|
47
|
+
# These may be weighted just like byte sequences.
|
48
|
+
include WeightedAction
|
49
|
+
|
50
|
+
# This class needs to support being instantiated without a value due to the way our XML data gets loaded,
|
51
|
+
# but the superclass `String` has a default `str=""` argument here that works perfectly for that need.
|
52
|
+
def initialize(str=-'', *args, case_sensitive: DEFAULT_SENSITIVITY, **kwargs)
|
53
|
+
# Prime `#replace` to treat its next `String` as case-sensitive iff we were told.
|
54
|
+
instance_variable_set(:@case_sensitive, case_sensitive) if case_sensitive == true
|
55
|
+
|
56
|
+
# Don't pass an initial `str` value to `super` if we were given one,
|
57
|
+
# because `#replace` has case-sensitivity-handling functionality that must be called.
|
58
|
+
super(str, *args, **kwargs)
|
59
|
+
self.replace(str) unless str.empty?
|
60
|
+
end
|
61
|
+
|
62
|
+
# Mark intent to be case-sensitive. Our source data's `<glob>` Attributes are parsed one at a time,
|
63
|
+
# so we won't know at the time of instantiation if we want to be case sensitive.
|
64
|
+
def case_sensitive=(sensitivity)
|
65
|
+
# Don't bother allocating an IVar if we're just going to be the default (case-insensitive)
|
66
|
+
if sensitivity == false then
|
67
|
+
remove_instance_variable(:@case_sensitive) if instance_variable_defined?(:@case_sensitive)
|
68
|
+
else
|
69
|
+
instance_variable_set(:@case_sensitive, sensitivity)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Return our case-sensitive String variation iff we are marked case-sensitive *and* have a String value,
|
74
|
+
# otherwise just return our frozen deduplicated self value.
|
75
|
+
def itself
|
76
|
+
instance_variable_get(:@case_sensitive)&.is_a?(::String) ? instance_variable_get(:@case_sensitive) : self
|
77
|
+
end
|
78
|
+
|
79
|
+
def case_sensitive
|
80
|
+
instance_variable_get(:@case_sensitive)&.is_a?(::String) ? instance_variable_get(:@case_sensitive) : nil
|
81
|
+
end
|
82
|
+
|
83
|
+
# Set an appropriate value for ourselves given a variety of input.
|
84
|
+
# Even though this is called `#replace` here and in `String`, this method will often be used
|
85
|
+
# to set initial instance values due to nondeterministic attribute order while parsing our XML data.
|
86
|
+
# @param otra [Object] new value: an instance of our own class, a `Symbol`, a `Pathname`, a `String`,
#   or anything else that responds to `#to_s`.
# @param case_sensitive [Boolean] force the computed value to be stored as a case-sensitive variation.
# @return [self] the receiver, whose content is now the case-folded form of the computed value.
def replace(otra, case_sensitive: DEFAULT_SENSITIVITY)
  # Reduce the input to one frozen String.
  #
  # `File::extname` returns the last dotted component including its dot, e.g. `File::extname("hello.jpg")`
  # => `".jpg"`; an asterisk is prepended to that to form a glob pattern. It returns an empty String when
  # there is no dotted component or only a leading dot, e.g. `File::extname(".bash_profile")` => `""`;
  # in that case the `Pathname` basename or the whole `String` is used instead.
  # A `String` that already ends in an asterisk is taken whole.
  candidate =
    case otra
    when self.class then -otra.to_s
    when ::Symbol   then -otra.name
    when ::Pathname
      suffix = otra.extname
      suffix.empty? ? -otra.basename.to_s : -suffix.prepend(-?*)
    when ::String
      suffix = File.extname(otra)
      (suffix.empty? || otra.end_with?(-?*)) ? -otra : -suffix.prepend(-?*)
    else -otra.to_s
    end

  # `super` sets our own content to the case-folded candidate; its return value is compared to the candidate below.
  folded_self = super(-candidate.downcase(:fold))

  # Keep the original-case candidate when any of these hold:
  #  - the folded value compares as different from the candidate,
  #  - the caller asked for case-sensitivity,
  #  - we were boolean-marked beforehand. This checks `== true`, not truthiness, because the IVar
  #    may instead hold the `String` variation left by an earlier `#replace`.
  #
  # The original author's note: non-downcased input is meant to cause case-sensitivity on its own, because XML
  # parsing may deliver a `pattern` attribute before the case-sensitivity mark, and every `case-sensitive="true"`
  # `<glob>` in current fd.o XML has an upper-case component.
  #
  # NOTE(review): `!=` is derived from `==`, and this file aliases `==` to `#eql?` (a `File.fnmatch?` match that
  # case-folds for unmarked instances). The first test may therefore not be a plain byte comparison — confirm
  # that it behaves as the note above intends.
  keep_variation = folded_self != candidate ||
                   case_sensitive ||
                   instance_variable_get(:@case_sensitive) == true

  if keep_variation
    instance_variable_set(:@case_sensitive, candidate)
  elsif instance_variable_defined?(:@case_sensitive)
    # Already-folded, unmarked input needs no IVar; drop any stale one.
    remove_instance_variable(:@case_sensitive)
  end

  self
end # replace
|
123
|
+
|
124
|
+
# Return a boolean describing our case-sensitivity status.
|
125
|
+
# @return [Boolean] `true` when `@case_sensitive` holds either a String case-variation or the literal
#   `true` mark set ahead of the next `#replace`; `false` for nil, false, or anything else.
def case_sensitive?
  marker = instance_variable_get(:@case_sensitive)
  # The IVar far more often holds the desired-case String than a boolean; both count as "sensitive".
  marker.is_a?(::String) || marker.is_a?(::TrueClass)
end
|
134
|
+
|
135
|
+
# Returns case-optional boolean equality between this `StickAround` and a given object `StickAround` or `String`.
|
136
|
+
# This is one of two methods necessary for matching Hash keys, but this method will be called only if `self#hash`
|
137
|
+
# and `otra#hash` return the same Integer value, complicated by the fact that MRI's C implementation of `rb_str_hash_cmp`
|
138
|
+
# won't use our overridden version of `#hash`.
|
139
|
+
# That's why we downcase ourselves in `#replace` and store case variations separately.
|
140
|
+
# Glob-match `otra` against our own value, which is used as the `File::fnmatch?` pattern.
#
# @param otra [Object] another instance of this class, a plain `String`, or a path-like object.
# @return [Boolean] `true` when `otra` matches our pattern; `false` when it does not, or when `otra`
#   is neither String-like (`#to_str`) nor path-like (`#to_path`), e.g. `nil`, an Integer, or a Symbol.
def eql?(otra)
  # https://ruby-doc.org/core/File.html#method-c-fnmatch-3F
  #
  # The `File` Class has kinda-poorly-documented Integer constants to control the behavior of `File::fnmatch?`.
  # If this feels non-Ruby-ish it's because this is a POSIX thing:
  # https://pubs.opengroup.org/onlinepubs/9699919799/functions/fnmatch.html
  #
  #   irb(main):061:0> File::constants::keep_if { _1.to_s.include?('FNM_') }
  #   => [:FNM_CASEFOLD, :FNM_EXTGLOB, :FNM_SYSCASE, :FNM_NOESCAPE, :FNM_PATHNAME, :FNM_DOTMATCH, :FNM_SHORTNAME]
  #   irb(main):062:0> File::constants::keep_if { _1.to_s.include?('FNM_') }.map(&File::method(:const_get))
  #   => [8, 16, 0, 1, 2, 4, 0]
  #
  # - `File::FNM_PATHNAME` controls wildcards in the pattern matching `File::SEPARATOR` in the candidate:
  #
  #     irb> File.fnmatch?('*.jpg', '/hello.jpg', File::FNM_PATHNAME)
  #     => false
  #     irb> File.fnmatch?('*.jpg', '/hello.jpg')
  #     => true
  #     irb> File.fnmatch?('*.jpg', 'hello.jpg', File::FNM_PATHNAME)
  #     => true
  #     irb> File.fnmatch?('*.jpg', 'hello.jpg')
  #     => true
  #
  # - `File::FNM_DOTMATCH` controls wildcards in the pattern matching a leading `.` in the candidate,
  #   like *nix-style "hidden" files:
  #
  #     irb> File.fnmatch?('*.jpg', '.hello.jpg', File::FNM_DOTMATCH)
  #     => true
  #     irb> File.fnmatch?('*.jpg', '.hello.jpg')
  #     => false
  #
  # - `File::FNM_EXTGLOB` controls support for brace-delimited glob syntax in patterns:
  #
  #     irb> File.fnmatch?('*.jp{e,}g', 'hello.jpg', File::FNM_EXTGLOB)
  #     => true
  #     irb> File.fnmatch?('*.jp{e,}g', 'hello.jpeg', File::FNM_EXTGLOB)
  #     => true
  #     irb> File.fnmatch?('*.jp{e,}g', 'hello.jpeg')
  #     => false
  #     irb> File.fnmatch?('*.jp{e,}g', 'hello.jpg')
  #     => false
  #
  # - `File::FNM_CASEFOLD` and `File::FNM_SYSCASE` control the case-sensitivity when matching,
  #   either by folding (explicit case-insensitivity) or by matching the behavior of the host operating system,
  #   *not* the behavior of any specific filesystem on that OS (https://bugs.ruby-lang.org/issues/15363),
  #   e.g. case-sensitive on BSD/Linux:
  #
  #     irb> RUBY_PLATFORM
  #     => "x86_64-linux"
  #     irb> File.fnmatch?('LOICENSE', 'loicense', File::FNM_SYSCASE)
  #     => false
  #     irb> File.fnmatch?('LOICENSE', 'loicense', File::FNM_CASEFOLD)
  #     => true
  #     irb> File.fnmatch?('LOICENSE', 'loicense')
  #     => false
  #
  # - `File::FNM_NOESCAPE` (ominously) controls matching escape sequences literally:
  #   https://github.com/ruby/ruby/blob/master/doc/syntax/literals.rdoc#label-Strings
  #
  #     irb> File.fnmatch?("*.jpg\\", 'hello.jpg', File::FNM_NOESCAPE)
  #     => false
  #     irb> File.fnmatch?("*.jpg\\", 'hello.jpg')
  #     => true
  #
  # - `File::FNM_SHORTNAME` seems to control eight-dot-three filename matching, per the documentation:
  #   "Makes patterns to match short names if existing. Valid only on Microsoft Windows."
  #
  # - Multiple of these Integer Constants can be bitwise-`OR`ed together for simultaneous use:
  #
  #     irb> File.fnmatch?('*.jp{e,}g', '/root/.HeLlO.jPEg', File::FNM_EXTGLOB | File::FNM_CASEFOLD | File::FNM_DOTMATCH)
  #     => true
  #     irb> File.fnmatch?('*.jp{e,}g', '/root/.HeLlO.jPEg', File::FNM_EXTGLOB | File::FNM_CASEFOLD | File::FNM_DOTMATCH | File::FNM_PATHNAME)
  #     => false

  # `File::fnmatch?` raises `TypeError` for operands it cannot convert to a String or path.
  # An equality test must answer `false` for those instead, otherwise `== nil`, `!=`,
  # `Array#include?` and similar calls raise.
  candidate = otra.itself
  return false unless candidate.respond_to?(:to_str) || candidate.respond_to?(:to_path)

  # Match case-sensitively when either side is case-sensitive. `otra` may be a plain `String`
  # without `#case_sensitive?`, in which case only our own setting applies.
  strict = case_sensitive? || (otra.respond_to?(:case_sensitive?) && otra.case_sensitive?)

  flags = File::FNM_DOTMATCH | File::FNM_EXTGLOB
  flags |= File::FNM_CASEFOLD unless strict

  File.fnmatch?(self.itself, candidate, flags)
end # eql?
|
233
|
+
|
234
|
+
# Hash-key usage depends on `#eql?`, but `:==` should have identical behavior for our own uses.
|
235
|
+
# `==` becomes another name for the `#eql?` implementation defined above. Ruby's default `!=` is derived
# from `==`, so inequality checks on instances go through the same glob comparison.
alias_method(:==, :eql?)
|
236
|
+
|
237
|
+
# Return an Integer hash value for this object. This method and `#eql?` are used by `Hash`, `Set`, and `#uniq` to
|
238
|
+
# associate separate Objects with each other for deduplication or for use as `Hash` keys.
|
239
|
+
# The `eql?` method will be called only *after* two Integer `#hash` values match!
|
240
|
+
#
|
241
|
+
# NOTE: MRI will not use this function in many cases!
|
242
|
+
# It has C implementations of methods like `rb_str_hash_cmp` for `Hash` lookups, and this is usually a Good Thing™
|
243
|
+
# since it makes `Hash`es fast when using `String` or `Symbol` as keys.
|
244
|
+
# Subclassing built-in types like `String` allows/forces us to use these same accelerated code paths,
|
245
|
+
# and it was incredibly confusing for me why my custom String subclass was behaving so strangely
|
246
|
+
# when used as a Hash key until I had a hunch to read MRI's `string.c` and `hash.c` and confirmed.
|
247
|
+
# I found this write-up once I knew to search for "rb_str_hash_cmp": https://kate.io/blog/strange-hash-instances-in-ruby/
|
248
|
+
#
|
249
|
+
# I'm going to define this anyway because it could still be useful in certain corner cases, but be aware of the above!
|
250
|
+
# This is the reason I explicitly `downcase` our self value in `#replace`, because otherwise the Hash keys will never match
|
251
|
+
# and `#eql?` will never even be called.
|
252
|
+
# Compute a hash value intended to collide for patterns that `#eql?` may treat as equal.
#
# - A pattern containing `*` but not starting with it hashes its first six characters, downcased.
# - A pattern starting with `*` that has a file extension hashes that extension without its dot.
# - Anything else falls back to the superclass implementation.
#
# @return [Integer]
def hash
  return super unless include?(-?*)

  # Use the non-bang `downcase`: `downcase!` returns `nil` when nothing changes, which made every
  # already-lowercase prefix hash as `nil`. `to_s` yields a plain `String`, so this cannot
  # re-enter this override.
  return to_s[...6].downcase.hash unless start_with?(-?*)

  extension = File.extname(self)
  return super if extension.empty?

  # Hash the extension itself. `#hash` must return an Integer, not the String (or the `nil`
  # that `delete_prefix!` returns when there is no prefix to delete).
  extension.delete_prefix(-?.).hash
end
|
258
|
+
|
259
|
+
end # StickAround
|
260
|
+
end # class CHECKING::YOU
|