distorted 0.5.4 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +661 -0
- data/README.md +5 -140
- data/bin/console +14 -0
- data/bin/distorted +6 -0
- data/bin/setup +8 -0
- data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/img/Less_Perfect_DOS_VGA.png +0 -0
- data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/img/More_Perfect_DOS_VGA.png +0 -0
- data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/img/Perfect_DOS_VGA.png +0 -0
- data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/less_more_perfect_dos_vga_437.html +52 -0
- data/font/1252/LICENSE/PerfectDOSVGA437/font-comment.php@file=perfect_dos_vga_437.html +5 -0
- data/font/1252/LessPerfectDOSVGA.ttf +0 -0
- data/font/1252/MorePerfectDOSVGA.ttf +0 -0
- data/font/1252/Perfect DOS VGA 437 Win.ttf +0 -0
- data/font/437/Perfect DOS VGA 437.ttf +0 -0
- data/font/437/dos437.txt +72 -0
- data/font/65001/Anonymous Pro B.ttf +0 -0
- data/font/65001/Anonymous Pro BI.ttf +0 -0
- data/font/65001/Anonymous Pro I.ttf +0 -0
- data/font/65001/Anonymous Pro.ttf +0 -0
- data/font/65001/LICENSE/AnonymousPro/FONTLOG.txt +45 -0
- data/font/65001/LICENSE/AnonymousPro/OFL-FAQ.txt +235 -0
- data/font/65001/LICENSE/AnonymousPro/OFL.txt +94 -0
- data/font/65001/LICENSE/AnonymousPro/README.txt +55 -0
- data/font/850/ProFont-Bold-01/LICENSE +22 -0
- data/font/850/ProFont-Bold-01/readme.txt +28 -0
- data/font/850/ProFontWindows-Bold.ttf +0 -0
- data/font/850/ProFontWindows.ttf +0 -0
- data/font/850/Profont/LICENSE +22 -0
- data/font/850/Profont/readme.txt +31 -0
- data/font/932/LICENSE/README-ttf.txt +213 -0
- data/font/932/mona.ttf +0 -0
- data/lib/distorted.rb +2 -0
- data/lib/distorted/checking_you_out.rb +219 -0
- data/lib/distorted/checking_you_out/README +4 -0
- data/lib/distorted/checking_you_out/application.yaml +33 -0
- data/lib/distorted/checking_you_out/font.yaml +29 -0
- data/lib/distorted/checking_you_out/image.yaml +108 -0
- data/lib/distorted/click_again.rb +333 -0
- data/lib/distorted/element_of_media.rb +2 -0
- data/lib/distorted/element_of_media/change.rb +119 -0
- data/lib/distorted/element_of_media/compound.rb +120 -0
- data/lib/distorted/error_code.rb +51 -0
- data/lib/distorted/floor.rb +17 -0
- data/lib/distorted/invoker.rb +97 -0
- data/lib/distorted/media_molecule.rb +58 -0
- data/lib/distorted/media_molecule/font.rb +195 -0
- data/lib/distorted/media_molecule/image.rb +33 -0
- data/lib/distorted/media_molecule/pdf.rb +44 -0
- data/lib/distorted/media_molecule/svg.rb +45 -0
- data/lib/distorted/media_molecule/text.rb +203 -0
- data/lib/distorted/media_molecule/video.rb +18 -0
- data/lib/distorted/modular_technology/gstreamer.rb +174 -0
- data/lib/distorted/modular_technology/pango.rb +90 -0
- data/lib/distorted/modular_technology/ttfunk.rb +48 -0
- data/lib/distorted/modular_technology/vips.rb +17 -0
- data/lib/distorted/modular_technology/vips/foreign.rb +489 -0
- data/lib/distorted/modular_technology/vips/load.rb +133 -0
- data/lib/distorted/modular_technology/vips/save.rb +161 -0
- data/lib/distorted/monkey_business/encoding.rb +317 -0
- data/lib/distorted/monkey_business/hash.rb +18 -0
- data/lib/distorted/monkey_business/set.rb +15 -0
- data/lib/distorted/monkey_business/string.rb +6 -0
- data/lib/distorted/triple_counter.rb +52 -0
- data/lib/distorted/version.rb +22 -0
- data/test/distorted_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +130 -20
@@ -0,0 +1,133 @@
|
|
1
|
+
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
require 'distorted/checking_you_out'
|
5
|
+
require 'distorted/modular_technology/vips/foreign'
|
6
|
+
require 'distorted/modular_technology/vips/save'
|
7
|
+
|
8
|
+
|
9
|
+
module Cooltrainer; end
module Cooltrainer::DistorteD; end
module Cooltrainer::DistorteD::Technology; end

# Mixin describing which media-types libvips can *load*, plus a memoized
# accessor for the includer's source file as a Vips::Image.
#
# NOTE: This module is intentionally opened with the compact `A::B::C` form.
# With that form the lexical scope is only this module, so a bare `Vips`
# below resolves to ruby-vips' top-level ::Vips rather than to
# Cooltrainer::DistorteD::Technology::Vips. Don't convert it to nested
# `module` statements without also qualifying those references.
module Cooltrainer::DistorteD::Technology::Vips::Load

  # Returns a Set of MIME::Types based on libvips VipsForeignLoad capabilities.
  # NOTE: libvips only declares support (via :get_suffixes) for the "saver" types,
  # but libvips can use additional external libraries for wider media-types support, e.g.:
  #
  # - SVG with librsvg2★ / libcairo. [*]
  # - PDF with PDFium if available, otherwise with libpoppler-glib / libcairo.
  # - OpenEXR/libIlmImf — ILM high dynamic range image format.
  # - maybe more: https://github.com/libvips/libvips/blob/master/configure.ac
  #
  # [FITS]: https://heasarc.gsfc.nasa.gov/docs/heasarc/fits.html
  #
  # [RSVG2]: This is the normal SVG library for the GNOME/GLib world and is
  # probably fine for 95% of use-cases, but I'm pissed off at it because of:
  #
  # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/56
  # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/100
  # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/183
  # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/494
  # - https://bugzilla.gnome.org/show_bug.cgi?id=666477
  # - https://phabricator.wikimedia.org/T35245
  #
  # TLDR: SVG <tspan> elements' [:x, :y, :dy, :dx] attributes can be
  # a space-delimited list of position values for individual
  # characters in the <tspan>, but librsvg2 only supported reading
  # those attributes as a single one-shot numeric value.
  # Documents using this totally-common and totally-in-spec feature
  # rendered incorrectly with librsvg2. Affected <tspan> elements'
  # subsequent children would hug one edge of the rendered output.
  #
  # And wouldn't you know it but the one (1) SVG on my website
  # at the time I built this feature (IIDX-Turntable-parts.svg) used
  # this feature for the double-digit parts diagram labels.
  # I ended up having to edit my input document to just squash the
  # offending <tspan>s down to a single child each.
  # I guess that's semantically more correct in my document since they are
  # numbers like Eleven and not two separate characters like '1 1'
  # but still ugh lol
  #
  # This was finally fixed in 2019 as of librsvg2 version 2.45.91 :)
  # https://gitlab.gnome.org/GNOME/librsvg/-/issues/494#note_579774
  #
  # [MAGICK]: The Magick-based '.bmp' loader is broken/missing in libvips <= 8.9.1,
  # but our automatic Loader detection will handle that. Just FYI :)
  #
  VIPS_LOADERS = Cooltrainer::DistorteD::Technology::Vips::vips_get_types('VipsForeignLoad').keep_if { |type|
    # Non-image top-level types (e.g. application/pdf, text/csv) are never kept.
    next false if ['application', 'text'].include?(type.media_type)

    # Archive-ish ('zip') subtypes are dropped, and so is SVG:
    # skip declaring SVG here since I want to handle it in a Vector-only Molecule
    # and will re-declare this there. Prolly need to think up a better way to do this.
    ['zip', 'svg'].none? { |fragment| type.sub_type.include?(fragment) }
  }

  # Vips::vips_foreign_find_save is based on filename suffix (extension),
  # but :vips_foreign_find_load seems to be based on file magic.
  # That is, we can't `vips_foreign_find_load` for a made-up filename
  # or plain suffix like we can do to build 'vips/save'::OUTER_LIMITS.
  # This caught me off guard but doesn't *entirely* not-make-sense,
  # considering Vips::Image::new_from_filename calls :vips_foreign_find_load
  # and obviously expects a file to be present.
  #
  ## Example — works with real file and fails with only suffix:
  # irb> Vips::vips_foreign_find_load '/home/okeeblow/cover.jpg'
  # => "VipsForeignLoadJpegFile"
  # irb> Vips::vips_foreign_find_load 'cover.jpg'
  # => nil
  #
  ## Syscalls of successful real-file :vips_foreign_find_load call
  # showing how it works:
  # [okeeblow@emi#okeeblow] strace ruby -e "require 'vips'; Vips::vips_foreign_find_load '/home/okeeblow/cover.jpg'" 2>&1|grep cover.jpg
  # access("/home/okeeblow/cover.jpg", R_OK) = 0
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY|O_CLOEXEC) = 5
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY|O_CLOEXEC) = 5
  # lstat("/home/okeeblow/cover.jpg", {st_mode=S_IFREG|0740, st_size=6242228, ...}) = 0
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY|O_CLOEXEC) = 5
  # stat("/home/okeeblow/cover.jpg", {st_mode=S_IFREG|0740, st_size=6242228, ...}) = 0
  # stat("/home/okeeblow/cover.jpg-journal", 0x7fffa70f4df0) = -1 ENOENT (No such file or directory)
  # stat("/home/okeeblow/cover.jpg-wal", 0x7fffa70f4df0) = -1 ENOENT (No such file or directory)
  # stat("/home/okeeblow/cover.jpg", {st_mode=S_IFREG|0740, st_size=6242228, ...}) = 0
  # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
  #
  ## …and of a fake suffix-only filename to show how it doesn't:
  # [okeeblow@emi#okeeblow] strace ruby -e "require 'vips'; Vips::vips_foreign_find_load 'fartbutt.jpg'" 2>&1|grep '.jpg'
  # read(5, ".write_to_target target, \".jpg[Q"..., 8192) = 8192
  # access("fartbutt.jpg", R_OK) = -1 ENOENT (No such file or directory)
  #
  ## Versus the corresponding Vips::vips_foreign_find_save which is *only* based
  # on filename suffix and does not try to look at a file at all,
  # perhaps (read: obviously) because that file wouldn't exist yet to test until we save it :)
  # [okeeblow@emi#okeeblow] strace ruby -e "require 'vips'; p Vips::vips_foreign_find_save 'fartbutt.jpg'" 2>&1|grep -E 'Save|.jpg'
  # read(5, ".write_to_target target, \".jpg[Q"..., 8192) = 8192
  # write(1, "\"VipsForeignSaveJpegFile\"\n", 26"VipsForeignSaveJpegFile"
  #
  # For this reason I'm going to write my own shim Loader-finder and use it instead.
  #
  # LOWER_WORLD maps each loadable type to whatever :vips_get_options returns
  # for the Loader that the suffix-based shim finds for the type's preferred extension.
  LOWER_WORLD = VIPS_LOADERS.each_with_object({}) { |type, options_by_type|
    loader = Cooltrainer::DistorteD::Technology::Vips::vips_foreign_find_load_suffix(".#{type.preferred_extension}")
    options_by_type[type] = Cooltrainer::DistorteD::Technology::Vips::vips_get_options(loader)
  }


  # Returns the Vips::Image opened from the includer's `path`, opening it on
  # first use and reusing the same instance on later calls.
  def to_vips_image
    # TODO: Learn more about what VipsAccess means for our use case,
    # if the default should be changed, and if it should be
    # a user-controllable attr or not.
    # https://libvips.github.io/libvips/API/current/VipsImage.html#VipsAccess
    # https://libvips.github.io/libvips/API/current/How-it-opens-files.md.html
    return @vips_image if @vips_image

    @vips_image = Vips::Image.new_from_file(path)
  end


end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
|
2
|
+
# Requiring libvips 8.8 for HEIC/HEIF (moo) support, `justify` support in the
|
3
|
+
# Vips::Image text operator, animated WebP support, and more:
|
4
|
+
# https://libvips.github.io/libvips/2019/04/22/What's-new-in-8.8.html
|
5
|
+
|
6
|
+
require 'distorted/element_of_media'
|
7
|
+
require 'distorted/triple_counter'
|
8
|
+
# Oldest libvips release DistorteD will run against.
VIPS_MINIMUM_VER = TripleCounter.new(8, 8, 0)

# Load ruby-vips and verify the libvips version it is bound to.
# Defines VIPS_AVAILABLE_VER as a side effect.
#
# Raises LoadError when:
# - ruby-vips (or anything it requires) can't be loaded,
# - the available libvips is older than VIPS_MINIMUM_VER, or
# - the libvips shared library is missing, in which case the original error is
#   re-raised with per-OS install instructions prepended to its message.
begin
  require 'vips'
  VIPS_AVAILABLE_VER = TripleCounter.new(Vips::version(0), Vips::version(1), Vips::version(2))

  unless VIPS_AVAILABLE_VER >= VIPS_MINIMUM_VER
    raise LoadError, "DistorteD needs libvips #{VIPS_MINIMUM_VER}, but the available version is '#{Vips::version_string}'"
  end

rescue LoadError => le
  # Only decorate failures to load the libvips shared library itself; any
  # other LoadError (missing gem, too-old version) is passed through untouched.
  # The pattern accepts the library's file name on every platform the help
  # text below covers, e.g. `libvips.so.42` (Linux/BSD), `libvips.42.dylib`
  # (macOS), and `libvips-42.dll` (Windows), and escapes the literal dots.
  raise unless le.message =~ /libvips[-.\w]*\.(?:so|dylib|dll)/

  # Multiple OS help
  help = <<~INSTALL

    Please install the VIPS (libvips) image processing library, version #{VIPS_MINIMUM_VER} or later.

    FreeBSD:
    pkg install graphics/vips

    macOS:
    brew install vips

    Debian/Ubuntu/Mint:
    apt install libvips libvips-dev
  INSTALL

  # Re-raise the same exception (and backtrace) with the install message first.
  raise le, "#{help}\n#{le}", le.backtrace
end
|
43
|
+
|
44
|
+
|
45
|
+
require 'set'
|
46
|
+
|
47
|
+
require 'distorted/checking_you_out'
|
48
|
+
|
49
|
+
|
50
|
+
module Cooltrainer; end
module Cooltrainer::DistorteD; end
module Cooltrainer::DistorteD::Technology; end

# Mixin describing which image media-types libvips can *save*, and defining
# one `to_<mediatype>_<subtype>`-style writer method per supported type.
# The includer must provide :to_vips_image.
#
# NOTE: This module is intentionally opened with the compact `A::B::C` form.
# With that form the lexical scope is only this module, so a bare `Vips`
# below resolves to ruby-vips' top-level ::Vips rather than to
# Cooltrainer::DistorteD::Technology::Vips. Don't convert it to nested
# `module` statements without also qualifying those references.
module Cooltrainer::DistorteD::Technology::Vips::Save


  # There is one (only one) native libvips image format, with file extname `.vips`.
  # As I write this—running libvips 8.8—the :get_suffixes function does not include
  # its own '.vips' as a supported extension.
  # There also (as of mid 2020) seems to be no official media-type assigned
  # for VIPS format, so I am going to make one up in CHECKING::YOU::OUT's local-data.
  # - Raw pixel data
  #
  # [RAW]: https://libvips.github.io/libvips/API/current/VipsForeignSave.html#vips-rawload
  # https://libvips.github.io/libvips/API/current/VipsForeignSave.html#vips-csvload
  #
  # Most libvips installations, even very minimally-built ones,
  # will almost certainly support a few very common formats:
  # - JPEG with libjpeg.
  # - PNG with libpng.
  # - GIF with giflib.
  # - WebP with libwebp.
  # - TIFF with libtiff.
  #
  # Normal libvips installations probably also support many less-mainstream formats:
  # - HEIF/HEIC with libheif.
  # - ICC profiles with liblcms2.
  # - Matlab with matio/libhdf5.
  # - FITS★ with cfitsio.
  # - Styled text with Pango/ft2.
  # - Saving GIF/BMP with Magick.
  #   NOTE that GIFs are *loaded* using giflib,
  #   and that BMP loading is unsupported.
  # - Various simple ASCII/binary-based formats with libgsf★
  #   · Comma-separated values
  #   · Netpbm★
  #   · VIPS (non-Matlab) matrices★
  #
  # [NETPBM]: https://en.wikipedia.org/wiki/Netpbm#File_formats
  # [LIBGSF]: https://developer.gnome.org/gsf/
  # [MATRIX]: https://libvips.github.io/libvips/API/current/VipsForeignSave.html#vips-matrixload

  # Vips allows us to query supported *SAVE* types by suffix.
  # There's a simple relationship between filetype and extension since
  # libvips uses the suffix to pick the Saver module.
  # https://libvips.github.io/libvips/API/current/VipsForeignSave.html
  #
  # Loader modules, on the other hand, are picked by sniffing the
  # first few bytes of the file, so a list of file extensions for
  # supported loadable formats won't always be complete.
  # For example, SVG and PDF are usually supported as loaders
  # (via rsvg and PDFium/Poppler)
  # https://github.com/libvips/ruby-vips/issues/186
  #
  # irb(main)> Vips.get_suffixes
  # => [".csv", ".mat", ".v", ".vips", ".ppm", ".pgm", ".pbm", ".pfm",
  # ".hdr", ".dz", ".png", ".jpg", ".jpeg", ".jpe", ".webp", ".tif",
  # ".tiff", ".fits", ".fit", ".fts", ".gif", ".bmp"]
  #
  # Every suffix's Types are collected into one *fresh* Set. Seeding the
  # accumulator means an empty suffix list yields an empty Set (instead of nil),
  # and that we never merge into or filter a Set owned by CHECKING::YOU::OUT.
  VIPS_SAVERS = Vips.get_suffixes.each_with_object(Set[]) { |suffix, types|
    # A single call to this will return a Set of MIME::Types for a String input
    found = CHECKING::YOU::OUT(suffix)
    types.merge(found) if found
  }.keep_if { |type|
    # Filter out any of libvips' supported output Types that aren't
    # actually images (e.g. CSV)
    type.media_type == 'image'
  }

  # Maps each saveable Type to whatever :vips_get_options returns for the
  # Saver libvips picks for that Type's preferred extension.
  OUTER_LIMITS = VIPS_SAVERS.each_with_object({}) { |type, options_by_type|
    saver = Vips::vips_foreign_find_save(".#{type.preferred_extension}")
    options_by_type[type] = Cooltrainer::DistorteD::Technology::Vips::vips_get_options(saver)
  }

  # Define a to_<mediatype>_<subtype> method for each MIME::Type supported by libvips,
  # e.g. a supported Type 'image/png' will define a method :to_image_png in any
  # context where this module is included.
  self::OUTER_LIMITS.each_key { |type|
    define_method(type.distorted_file_method) { |dest_root, change|
      vips_save(dest_root, change)
    }
  }

  protected

  # Generic Vips saver method, optionally handling resizing and cropping.
  # Writes the unmodified image to the first of `change.paths(dest_root)`,
  # then one thumbnail per entry of `change.breaks` to `change.path(dest_root, break)`.
  #
  # Returns `change.breaks` on success, or nil if libvips reports
  # 'No known saver' for a destination. Any other Vips::Error propagates.
  #
  # NOTE: libvips chooses a saver (internally) based on the extname of the destination path.
  # TODO: String-buffer version of this method using e.g. Image#jpegsave_buffer
  def vips_save(dest_root, change)
    to_vips_image.write_to_file(change.paths(dest_root).first)
    change.breaks.each { |width|
      resized = to_vips_image.thumbnail_image(width.to_int, crop: change.crop || :none)
      resized.write_to_file(change.path(dest_root, width))
    }
  rescue Vips::Error => error
    raise unless error.message.include?('No known saver')

    # TODO: Handle missing output formats. Replacements? Skip it? Die?
    nil
  end # save

end
|
@@ -0,0 +1,317 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
## Adds better support to Ruby's Encoding class for IBM/Microsoft-style numeric codepage IDs:
|
4
|
+
# - Adds a `:code_page` method on any instance of Encoding, returning the Integer codepage ID.
|
5
|
+
# - Adds a `:page_code` singleton method on :Encoding, returning the Encoding instance for any Integer codepage ID.
|
6
|
+
# - Patches Encoding::find() to add Integer and numeric-String find() support using :page_code.
|
7
|
+
# - Adds a `:code_page_orphans` singleton method on :Encoding, returning a Set of built-in Encodings that
|
8
|
+
# do not have a corresponding built-in `:CP<####>` name/constant.
|
9
|
+
# - Includes many additional numeric codepage IDs based on information
|
10
|
+
# from IANA, Unicode Consortium, OS vendors, and some of my own speculation.
|
11
|
+
|
12
|
+
## This is similar in effect (but not in implementation) to the 'encoding-codepage' Gem:
|
13
|
+
# https://github.com/ConradIrwin/encoding-codepage
|
14
|
+
# My choice of method name `:code_page` was intentional to avoid conflict with this Gem's `:codepage`.
|
15
|
+
|
16
|
+
|
17
|
+
## Notes-To-Self about Encoding (the class I'm patching) and encodings in general:
|
18
|
+
|
19
|
+
## Unicode Normalization Forms
|
20
|
+
#
|
21
|
+
# - Canonical composed (NFC) and decomposed (NFD) forms.
|
22
|
+
# - Non-canonical composed (NFKC) and decomposed (NFKD) forms.
|
23
|
+
#
|
24
|
+
# "For example, form C uses the single Unicode code point "Ä" (U+00C4),
|
25
|
+
# while form D uses ("A" + "¨", that is U+0041 U+0308).
|
26
|
+
# These render identically, because "¨" (U+0308) is a combining character."
|
27
|
+
#
|
28
|
+
# http://www.unicode.org/faq/normalization.html
|
29
|
+
# http://www.unicode.org/reports/tr15/
|
30
|
+
# https://docs.microsoft.com/en-us/windows/win32/intl/using-unicode-normalization-to-represent-strings
|
31
|
+
# https://en.wikipedia.org/wiki/Precomposed_character
|
32
|
+
# https://en.wikipedia.org/wiki/Unicode_equivalence
|
33
|
+
#
|
34
|
+
# HFS+ is a notable outlier among filesystems by requiring decomposed form (actually 'UTF-8-Mac' variant).
|
35
|
+
|
36
|
+
## Ruby includes a lot of the desired codepage ID data built-in,
|
37
|
+
# but in the form of String alias names for Encoding instances,
|
38
|
+
# e.g. KOI8-R is also codepage 878:
|
39
|
+
#
|
40
|
+
# irb> Encoding::KOI8_R.names
|
41
|
+
# => ["KOI8-R", "CP878"]
|
42
|
+
#
|
43
|
+
# irb> Encoding::KOI8_R.names.any?{ |n| n =~ /^(CP|IBM|Windows[-_])(?<code_page>\d{3,}$)/ }
|
44
|
+
# => true
|
45
|
+
# irb> Regexp.last_match
|
46
|
+
# => #<MatchData "CP878" code_page:"878">
|
47
|
+
#
|
48
|
+
# My code defers to this built-in data where possible instead of doing
|
49
|
+
# a complete import of the Microsoft identifiers like the Gem.
|
50
|
+
|
51
|
+
|
52
|
+
## Some encodings have both generic and vendor-prefixed names,
|
53
|
+
## and some are canonically one or the other, e.g.:
|
54
|
+
#
|
55
|
+
# irb> Encoding::IBM437
|
56
|
+
# => #<Encoding:IBM437>
|
57
|
+
# irb> Encoding::CP437
|
58
|
+
# => #<Encoding:IBM437>
|
59
|
+
#
|
60
|
+
# irb> Encoding::IBM850
|
61
|
+
# => #<Encoding:CP850>
|
62
|
+
# irb> Encoding::CP850
|
63
|
+
# => #<Encoding:CP850>
|
64
|
+
|
65
|
+
|
66
|
+
class Encoding

  # Regexp to match and extract Ruby's built-in numeric codepage IDs
  # from their Encoding's names, e.g. 'CP878' => code_page: '878'.
  #
  # Using IGNORECASE to handle the duplicate differing-capitalization constants,
  # e.g. Encoding::WINDOWS_31J and Encoding::Windows_31J both exist and are equivalent.
  #
  # Worth mentioning since this file deals with Encoding,
  # but the Regexp itself also has an internal Encoding that can be changed
  # if I had any reason to (I don't):
  # https://ruby-doc.org/core/Regexp.html#class-Regexp-label-Encoding
  CODE_PAGE_ENCODING_NAME = Regexp.new('^(CP|IBM|Windows[-_])(?<code_page>\d{3,}$)', Regexp::IGNORECASE)

  # Codepage IDs for built-in Encodings, keyed by Encoding instance.
  # Only entries that pass the filter in :adopted_encoding_code_page_ids
  # are actually used by :code_page.
  #
  # Data sources:
  # https://www.aivosto.com/articles/charsets-codepages.html
  # https://developer.apple.com/documentation/coreservices/1400434-ms-dos_and_windows_text_encodings
  # https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  # https://en.wikipedia.org/wiki/CCSID
  # https://github.com/SheetJS/js-codepage/blob/master/codepage.md
  ADDITIONAL_ENCODING_CODE_PAGE_IDS = {

    # United States
    Encoding::US_ASCII => 20127,

    # Unicode
    Encoding::UTF_16LE => 1200,
    Encoding::UTF_16BE => 1201,
    Encoding::UTF_32LE => 12000,
    Encoding::UTF_32BE => 12001,

    ## Japanese
    #
    # Code Page 932 is Windows-31J, but I want to provide fallback mapping
    # between 932 and Shift_JIS to handle detected-text or `encoding` arguments
    # that return Shift_JIS since that naming is much much more well-known than 31J.
    Encoding::SHIFT_JIS => 932,
    # https://referencesource.microsoft.com/#mscorlib/system/text/eucjpencoding.cs
    # https://www.redmine.org/issues/29442
    # https://www.sljfaq.org/afaq/encodings.html
    # https://uic.jp/charset/
    # http://www.monyo.com/technical/samba/docs/Japanese-HOWTO-3.0.en.txt
    Encoding::EUC_JP_MS => 20932,
    Encoding::EUC_JP => 51932,
    # Encoding:EUC-JIS-2004 dunno
    #
    # https://www.debian.org/doc/manuals/intro-i18n/ch-coding.en.html 3.2: Stateless and Stateful
    # TL;DR: Stateful uses an escape sequence to switch charset;
    # Stateless have all-unique codepoints.
    # Normal ISO-2022-JP is stateful.
    # "For example, in ISO 2022-JP, two bytes of 0x24 0x2c may mean a Japanese Hiragana character 'が'
    # or two ASCII character of '$' and ',' according to the shift state."
    # Encoding::STATELESS_ISO_2022_JP
    #
    # Mobile operator specific encodings that I have no numeric IDs for rn:
    # Encoding:UTF8-DoCoMo
    # Encoding:SJIS-DoCoMo
    # Encoding:UTF8-KDDI
    # Encoding:SJIS-KDDI
    # Encoding:stateless-ISO-2022-JP-KDDI
    # Encoding:UTF8-SoftBank
    # Encoding:SJIS-SoftBank

    ## Chinese
    #
    # https://en.wikipedia.org/wiki/Code_page_903
    Encoding::GB1988 => 903,
    #
    ## Hong Kong Supplementary Character Set
    # The Windows version of this seems to be the built-in CP951:
    # https://web.archive.org/web/20160402215421/https://blogs.msdn.microsoft.com/shawnste/2007/03/12/cp-951-hkscs/
    # https://web.archive.org/web/20141129233053/http://www-01.ibm.com/software/globalization/ccsid/ccsid5471.html
    # (IBM's CCSID is 5471, per the URL above.)
    Encoding::BIG5_HKSCS => 5471,
    #
    # The 936 postfix is a reference to the standard Windows Chinese encoding being CP936 / GBK.
    # "GB2312 is the registered internet name for EUC-CN, which is its usual encoded form."
    Encoding::GB2312 => 20936,
    Encoding::GB12345 => 51936,
    #Encoding:GB2312_HZ => 52936, # Doesn't exist in Ruby
    Encoding::GB18030 => 54936,


    ## Korean, Taiwanese, and Thai
    #
    # The traditional Korean Windows Code Page is CP949, available in Ruby
    # but not under any other name aliases.
    # IBM uses CP1363, not in Ruby.
    Encoding::EUC_KR => 51949,
    #
    # Taiwan
    Encoding::EUC_TW => 51950,
    # Unicode 補完計畫 / Unicode-At-On is a Big5 variant once popular in Taiwan:
    # https://lists.gnu.org/archive/html/bug-gnu-libiconv/2010-11/msg00007.html
    # https://lists.w3.org/Archives/Public/public-html-ig-zh/2012Apr/0061.html
    # Encoding::BIG5_UAO
    #
    # CP950 (available in Ruby) is the code page used on Windows under the name "big5",
    # but I want to map the generic Big5 Encoding to 950 as well to handle
    # detected and specified encodings by that name.
    # "The major difference between Windows code page 950 and "common" (non-vendor-specific) Big5
    # is the incorporation of a subset of the ETEN extensions to Big5 at 0xF9D6 through 0xF9FE
    # (comprising the seven Chinese characters 碁, 銹, 裏, 墻, 恒, 粧, and 嫺,
    # followed by 34 box drawing characters and block elements)."
    Encoding::Big5 => 950,
    #
    # Encoding::TIS_620 is the base Thai 8-bit encoding standard that is apparently
    # never actually used in the wild.
    # ISO-8859-11 is identical to it with the sole exception "that ISO/IEC 8859-11
    # allocates non-breaking space to code 0xA0, while TIS-620 leaves it undefined."
    # "The Microsoft Windows code page 874 as well as the code page used in the
    # Thai version of the Apple Macintosh, MacThai,
    # are variants of TIS-620 — incompatible with each other, however."


    # Eastern Europe
    #Encoding::KOI8_R => 20866,
    Encoding::KOI8_U => 21866,

    ## ISO/IEC 8859 (8-bit) encoding family
    #
    Encoding::ISO_8859_1 => 28591, # West European languages (Latin-1)
    Encoding::ISO_8859_2 => 28592, # Central and East European languages (Latin-2)
    Encoding::ISO_8859_3 => 28593, # Southeast European and miscellaneous languages (Latin-3)
    Encoding::ISO_8859_4 => 28594, # Scandinavian/Baltic languages (Latin-4)
    Encoding::ISO_8859_5 => 28595, # Latin/Cyrillic
    Encoding::ISO_8859_6 => 28596, # Latin/Arabic
    Encoding::ISO_8859_7 => 28597, # Latin/Greek
    Encoding::ISO_8859_8 => 28598, # Latin/Hebrew
    Encoding::ISO_8859_9 => 28599, # Latin-1 modification for Turkish (Latin-5)
    #
    # ISO-8859-10 covers Nordic languages better than ISO_8859_4.
    # Wikipedia says this has been assigned in Windows as 28600 even though Microsoft's
    # page doesn't list it now in 2020, but w/e.
    # IBM assigned it as CP919.
    Encoding::ISO_8859_10 => 28600, # Lappish/Nordic/Eskimo languages (Latin-6)
    #
    # Wikipedia says this is assigned, but same deal.
    Encoding::ISO_8859_11 => 28601, # Latin/Thai
    #
    # Intended Celtic encoding abandoned in 1997 in favor of ISO_8859_14:
    # Encoding::ISO_8859_12 => 28602,
    #
    Encoding::ISO_8859_13 => 28603, # Baltic Rim languages (Latin-7)
    Encoding::ISO_8859_14 => 28604, # Celtic (Latin-8)
    Encoding::ISO_8859_15 => 28605, # West European languages (Latin-9)
    Encoding::ISO_8859_16 => 28606, # Romanian (Latin-10)

    # Apple encodings
    #
    # UTF8_MAC is the encoding Mac OS X uses on HFS+ filesystems and is a variant of UTF-8-NFD.
    # https://web.archive.org/web/20140812023313/http://developer.apple.com/library/ios/documentation/MacOSX/Conceptual/BPInternational/Articles/FileEncodings.html
    # "Mac OS Extended (HFS+) uses canonically decomposed Unicode 3.2 in UTF-16 format,
    # which consists of a sequence of 16-bit codes.
    # (Characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D are not decomposed.)"
    #
    # There isn't a good Microsoft-style ID I can assign to it, so this is just FYI.

    # Classic Mac encodings
    #
    # https://en.wikipedia.org/wiki/Category:Mac_OS_character_encodings
    # http://mirror.informatimago.com/next/developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.1b.html
    #
    # MacRoman pre-OS-8.5 has the "Universal currency symbol" at 0xDB,
    # while 8.5 and later replace it with the (then-new) Euro symbol:
    # https://en.wikipedia.org/wiki/Currency_sign_(typography)
    Encoding::MACROMAN => 10000,
    #
    # "Shift-JIS with JIS Roman modifications, extra 1-byte characters, 2-byte Apple extensions,
    # and some vertical presentation forms in the range 0xEB40--0xEDFE ("ku plus 84")."
    # Ruby also defines Encoding::MACJAPAN but it's the same Encoding.
    Encoding::MACJAPANESE => 10001,
    #
    # The following encodings are not defined in Ruby's Encoding class,
    # but I'm listing them here for completeness' sake.
    # MACCHINESETRAD => 10002,
    # MACKOREAN => 10003,
    # MACARABIC => 10004,
    # MACHEBREW => 10005,
    # MACGREEK => 10006,
    # MACCYRILLIC => 10007,
    # MACCHINESESIMP => 10008,
    #
    # Unlike MacJapan/MacJapanese, MacRomania is something different than MacRoman.
    Encoding::MACROMANIA => 10010,
    #
    Encoding::MACUKRAINE => 10017,
    Encoding::MACTHAI => 10021,
    Encoding::MACCENTEURO => 10029,
    Encoding::MACICELAND => 10079,
    Encoding::MACTURKISH => 10081,
    Encoding::MACCROATIAN => 10082,

  } # ADDITIONAL_ENCODING_CODE_PAGE_IDS

  # Returns a Hash of the built-in-orphan Encodings we now have codepage IDs for,
  # e.g. {#<Encoding:US-ASCII>=>20127, #<Encoding:UTF-16BE>=>1201, #<Encoding:UTF-16LE>=>1200}
  #
  # An orphan is adopted only if ADDITIONAL_ENCODING_CODE_PAGE_IDS has an ID for it
  # AND no built-in `Encoding::CP<id>` constant already exists for that ID.
  # The result is computed once and memoized on the Encoding class itself.
  #
  # NOTE(review): the constant check also drops entries whose ID already belongs
  # to another built-in Encoding (e.g. Shift_JIS => 932, since CP932 is defined),
  # so :code_page returns nil for those. Confirm that is the intent.
  def self.adopted_encoding_code_page_ids
    @adopted_encoding_code_page_ids ||= code_page_orphans.each_with_object({}) { |encoding, adopted|
      id = ADDITIONAL_ENCODING_CODE_PAGE_IDS[encoding]
      # irb> Encoding.const_defined?('CP932')
      # => true
      next if id.nil? || const_defined?("CP#{id}")

      adopted[encoding] = id
    }
  end

  # Returns a Set of built-in Encodings whose :names /!\ DO NOT /!\ contain a usable
  # numeric codepage ID, as matched by our Regexp.
  def self.code_page_orphans
    list.select { |encoding|
      encoding.respond_to?(:names) &&
        encoding.names.none? { |name| CODE_PAGE_ENCODING_NAME.match(name) }
    }.to_set
  end

  # Returns the Encoding instance for an Integer codepage ID by looking up
  # the name `CP<id>`, or nil if Ruby knows no Encoding by that name.
  def self.page_code(code_page_id)
    # Every canonically-Windows*/IBM*-named Encoding seems to also have a 'CP<whatever>' equivalent.
    find("CP#{code_page_id}")
  rescue ArgumentError
    # Encoding::find raises ArgumentError for unknown names.
    nil
  end

  # Returns the Integer codepage ID of this Encoding instance, or nil if it has none.
  #
  # An adopted ID (see :adopted_encoding_code_page_ids) wins; otherwise the ID is
  # taken from the first of this Encoding's :names matching CODE_PAGE_ENCODING_NAME.
  def code_page
    adopted = Encoding.adopted_encoding_code_page_ids[self]
    # Return early: chaining this with `||` in front of a ternary would group as
    # `(adopted || match) ? … : …` and then read the match data of a match that never ran.
    return adopted if adopted

    names.each { |name|
      matched = CODE_PAGE_ENCODING_NAME.match(name)
      return matched[:code_page].to_i if matched
    }
    nil
  end

  # Patch the Encoding::find() method to support taking Integer and numeric-String arguments
  # in addition to the arguments it usually supports.
  #
  # - Integer: looked up with :page_code; nil if there is no such codepage.
  # - String consisting only of digits (surrounding whitespace allowed) with a
  #   value greater than zero: looked up with :page_code, then with the built-in
  #   :find so that all-digit built-in aliases (e.g. '646' for US-ASCII) keep
  #   working; nil if neither knows it.
  # - Anything else (names, Encoding instances, …) goes straight to the built-in
  #   :find, with its usual return values and exceptions.
  builtin_find = singleton_method(:find)
  define_singleton_method(:find) do |code_page_id|
    begin
      if code_page_id.is_a?(Integer)
        Encoding.page_code(code_page_id)
      elsif code_page_id.is_a?(String) && code_page_id.match?(/\A\s*\d+\s*\z/) && code_page_id.to_i > 0
        # String#to_i alone is not a sufficient test because it also accepts
        # Strings that merely *start* with digits, hence the Regexp above.
        Encoding.page_code(code_page_id.to_i) || begin
          builtin_find.(code_page_id)
        rescue ArgumentError
          nil
        end
      else
        builtin_find.(code_page_id)
      end
    rescue RuntimeError
      builtin_find.(code_page_id)
    end
  end

end
|