distorted 0.5.4 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +661 -0
  3. data/README.md +5 -140
  4. data/bin/console +14 -0
  5. data/bin/distorted +6 -0
  6. data/bin/setup +8 -0
  7. data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/img/Less_Perfect_DOS_VGA.png +0 -0
  8. data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/img/More_Perfect_DOS_VGA.png +0 -0
  9. data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/img/Perfect_DOS_VGA.png +0 -0
  10. data/font/1252/LICENSE/MoreLessPerfectDOSVGA437/less_more_perfect_dos_vga_437.html +52 -0
  11. data/font/1252/LICENSE/PerfectDOSVGA437/font-comment.php@file=perfect_dos_vga_437.html +5 -0
  12. data/font/1252/LessPerfectDOSVGA.ttf +0 -0
  13. data/font/1252/MorePerfectDOSVGA.ttf +0 -0
  14. data/font/1252/Perfect DOS VGA 437 Win.ttf +0 -0
  15. data/font/437/Perfect DOS VGA 437.ttf +0 -0
  16. data/font/437/dos437.txt +72 -0
  17. data/font/65001/Anonymous Pro B.ttf +0 -0
  18. data/font/65001/Anonymous Pro BI.ttf +0 -0
  19. data/font/65001/Anonymous Pro I.ttf +0 -0
  20. data/font/65001/Anonymous Pro.ttf +0 -0
  21. data/font/65001/LICENSE/AnonymousPro/FONTLOG.txt +45 -0
  22. data/font/65001/LICENSE/AnonymousPro/OFL-FAQ.txt +235 -0
  23. data/font/65001/LICENSE/AnonymousPro/OFL.txt +94 -0
  24. data/font/65001/LICENSE/AnonymousPro/README.txt +55 -0
  25. data/font/850/ProFont-Bold-01/LICENSE +22 -0
  26. data/font/850/ProFont-Bold-01/readme.txt +28 -0
  27. data/font/850/ProFontWindows-Bold.ttf +0 -0
  28. data/font/850/ProFontWindows.ttf +0 -0
  29. data/font/850/Profont/LICENSE +22 -0
  30. data/font/850/Profont/readme.txt +31 -0
  31. data/font/932/LICENSE/README-ttf.txt +213 -0
  32. data/font/932/mona.ttf +0 -0
  33. data/lib/distorted.rb +2 -0
  34. data/lib/distorted/checking_you_out.rb +219 -0
  35. data/lib/distorted/checking_you_out/README +4 -0
  36. data/lib/distorted/checking_you_out/application.yaml +33 -0
  37. data/lib/distorted/checking_you_out/font.yaml +29 -0
  38. data/lib/distorted/checking_you_out/image.yaml +108 -0
  39. data/lib/distorted/click_again.rb +333 -0
  40. data/lib/distorted/element_of_media.rb +2 -0
  41. data/lib/distorted/element_of_media/change.rb +119 -0
  42. data/lib/distorted/element_of_media/compound.rb +120 -0
  43. data/lib/distorted/error_code.rb +51 -0
  44. data/lib/distorted/floor.rb +17 -0
  45. data/lib/distorted/invoker.rb +97 -0
  46. data/lib/distorted/media_molecule.rb +58 -0
  47. data/lib/distorted/media_molecule/font.rb +195 -0
  48. data/lib/distorted/media_molecule/image.rb +33 -0
  49. data/lib/distorted/media_molecule/pdf.rb +44 -0
  50. data/lib/distorted/media_molecule/svg.rb +45 -0
  51. data/lib/distorted/media_molecule/text.rb +203 -0
  52. data/lib/distorted/media_molecule/video.rb +18 -0
  53. data/lib/distorted/modular_technology/gstreamer.rb +174 -0
  54. data/lib/distorted/modular_technology/pango.rb +90 -0
  55. data/lib/distorted/modular_technology/ttfunk.rb +48 -0
  56. data/lib/distorted/modular_technology/vips.rb +17 -0
  57. data/lib/distorted/modular_technology/vips/foreign.rb +489 -0
  58. data/lib/distorted/modular_technology/vips/load.rb +133 -0
  59. data/lib/distorted/modular_technology/vips/save.rb +161 -0
  60. data/lib/distorted/monkey_business/encoding.rb +317 -0
  61. data/lib/distorted/monkey_business/hash.rb +18 -0
  62. data/lib/distorted/monkey_business/set.rb +15 -0
  63. data/lib/distorted/monkey_business/string.rb +6 -0
  64. data/lib/distorted/triple_counter.rb +52 -0
  65. data/lib/distorted/version.rb +22 -0
  66. data/test/distorted_test.rb +11 -0
  67. data/test/test_helper.rb +4 -0
  68. metadata +130 -20
@@ -0,0 +1,133 @@
1
+
2
+ require 'set'
3
+
4
+ require 'distorted/checking_you_out'
5
+ require 'distorted/modular_technology/vips/foreign'
6
+ require 'distorted/modular_technology/vips/save'
7
+
8
+
9
+ module Cooltrainer; end
10
+ module Cooltrainer::DistorteD; end
11
+ module Cooltrainer::DistorteD::Technology; end
12
+ module Cooltrainer::DistorteD::Technology::Vips::Load
13
+
14
+ # Returns a Set of MIME::Types based on libvips VipsForeignLoad capabilities.
15
+ # NOTE: libvips only declares support (via :get_suffixes) for the "saver" types,
16
+ # but libvips can use additional external libraries for wider media-types support, e.g.:
17
+ #
18
+ # - SVG with librsvg2★ / libcairo. [*]
19
+ # - PDF with PDFium if available, otherwise with libpoppler-glib / libcairo.
20
+ # - OpenEXR/libIlmImf — ILM high dynamic range image format.
21
+ # - maybe more: https://github.com/libvips/libvips/blob/master/configure.ac
22
+ #
23
+ # [FITS]: https://heasarc.gsfc.nasa.gov/docs/heasarc/fits.html
24
+ #
25
+ # [RSVG2]: This is the normal SVG library for the GNOME/GLib world and is
26
+ # probably fine for 95% of use-cases, but I'm pissed off at it because of:
27
+ #
28
+ # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/56
29
+ # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/100
30
+ # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/183
31
+ # - https://gitlab.gnome.org/GNOME/librsvg/-/issues/494
32
+ # - https://bugzilla.gnome.org/show_bug.cgi?id=666477
33
+ # - https://phabricator.wikimedia.org/T35245
34
+ #
35
+ # TLDR: SVG <tspan> elements' [:x, :y, :dy, :dx] attributes can be
36
+ # a space-delimited list of position values for individual
37
+ # characters in the <tspan>, but librsvg2 only supported reading
38
+ # those attributes as a single one-shot numeric value.
39
+ # Documents using this totally-common and totally-in-spec feature
40
+ # rendered incorrectly with librsvg2. Affected <tspan> elements'
41
+ # subsequent children would hug one edge of the rendered output.
42
+ #
43
+ # And wouldn't you know it but the one (1) SVG on my website
44
+ # at the time I built this feature (IIDX-Turntable-parts.svg) used
45
+ # this feature for the double-digit parts diagram labels.
46
+ # I ended up having to edit my input document to just squash the
47
+ # offending <tspan>s down to a single child each.
48
+ # I guess that's semantically more correct in my document since they are
49
+ # numbers like Eleven and not two separate characters like '1 1'
50
+ # but still ugh lol
51
+ #
52
+ # This was finally fixed in 2019 as of librsvg2 version 2.45.91 :)
53
+ # https://gitlab.gnome.org/GNOME/librsvg/-/issues/494#note_579774
54
+ #
55
+ # [MAGICK]: The Magick-based '.bmp' loader is broken/missing in libvips <= 8.9.1,
56
+ # but our automatic Loader detection will handle that. Just FYI :)
57
+ #
58
# Media-types reported by libvips' 'VipsForeignLoad' class hierarchy, minus the
# ones this module should not claim:
#   - anything whose top-level media type is 'application' (e.g. application/pdf)
#     or 'text' (e.g. text/csv);
#   - anything whose sub-type mentions 'zip';
#   - anything whose sub-type mentions 'svg'. SVG is skipped here because it is
#     meant to be handled (and re-declared) by a Vector-only Molecule.
#     A better way to express that split is probably needed.
# `keep_if` filters the collection returned by :vips_get_types in place.
VIPS_LOADERS = Cooltrainer::DistorteD::Technology::Vips::vips_get_types('VipsForeignLoad').keep_if { |type|
  raster_media = !%w[application text].include?(type.media_type)
  excluded_sub = %w[zip svg].any? { |fragment| type.sub_type.include?(fragment) }
  raster_media && !excluded_sub
}
69
+
70
+ # Vips::vips_foreign_find_save is based on filename suffix (extension),
71
+ # but :vips_foreign_find_load seems to be based on file magic.
72
+ # That is, we can't `vips_foreign_find_load` for a made-up filename
73
+ # or plain suffix like we can do to build 'vips/save'::OUTER_LIMITS.
74
+ # This caught me off guard but doesn't *entirely* not-make-sense,
75
+ # considering Vips::Image::new_from_filename calls :vips_foreign_find_load
76
+ # and obviously expects a file to be present.
77
+ #
78
+ ## Example — works with real file and fails with only suffix:
79
+ # irb> Vips::vips_foreign_find_load '/home/okeeblow/cover.jpg'
80
+ # => "VipsForeignLoadJpegFile"
81
+ # irb> Vips::vips_foreign_find_load 'cover.jpg'
82
+ # => nil
83
+ #
84
+ ## Syscalls of successful real-file :vips_foreign_find_load call
85
+ # showing how it works:
86
+ # [okeeblow@emi#okeeblow] strace ruby -e "require 'vips'; Vips::vips_foreign_find_load '/home/okeeblow/cover.jpg'" 2>&1|grep cover.jpg
87
+ # access("/home/okeeblow/cover.jpg", R_OK) = 0
88
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
89
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
90
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
91
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
92
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY|O_CLOEXEC) = 5
93
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY|O_CLOEXEC) = 5
94
+ # lstat("/home/okeeblow/cover.jpg", {st_mode=S_IFREG|0740, st_size=6242228, ...}) = 0
95
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY|O_CLOEXEC) = 5
96
+ # stat("/home/okeeblow/cover.jpg", {st_mode=S_IFREG|0740, st_size=6242228, ...}) = 0
97
+ # stat("/home/okeeblow/cover.jpg-journal", 0x7fffa70f4df0) = -1 ENOENT (No such file or directory)
98
+ # stat("/home/okeeblow/cover.jpg-wal", 0x7fffa70f4df0) = -1 ENOENT (No such file or directory)
99
+ # stat("/home/okeeblow/cover.jpg", {st_mode=S_IFREG|0740, st_size=6242228, ...}) = 0
100
+ # openat(AT_FDCWD, "/home/okeeblow/cover.jpg", O_RDONLY) = 5
101
+ #
102
+ ## …and of a fake suffix-only filename to show how it doesn't:
103
+ # [okeeblow@emi#okeeblow] strace ruby -e "require 'vips'; Vips::vips_foreign_find_load 'fartbutt.jpg'" 2>&1|grep '.jpg'
104
+ # read(5, ".write_to_target target, \".jpg[Q"..., 8192) = 8192
105
+ # access("fartbutt.jpg", R_OK) = -1 ENOENT (No such file or directory)
106
+ #
107
+ ## Versus the corresponding Vips::vips_foreign_find_save which is *only* based
108
+ # on filename suffix and does not try to look at a file at all,
109
+ # perhaps (read: obviously) because that file wouldn't exist yet to test until we save it :)
110
+ # [okeeblow@emi#okeeblow] strace ruby -e "require 'vips'; p Vips::vips_foreign_find_save 'fartbutt.jpg'" 2>&1|grep -E 'Save|.jpg'
111
+ # read(5, ".write_to_target target, \".jpg[Q"..., 8192) = 8192
112
+ # write(1, "\"VipsForeignSaveJpegFile\"\n", 26"VipsForeignSaveJpegFile"
113
+ #
114
+ # For this reason I'm going to write my own shim Loader-finder and use it instead.
115
# Hash of each loadable type => the options reported by :vips_get_options for
# the loader that our own suffix-based finder picks for that type's preferred
# file extension (the stock :vips_foreign_find_load needs a real file).
LOWER_WORLD = VIPS_LOADERS.each_with_object({}) { |type, loaders|
  loader_name = Cooltrainer::DistorteD::Technology::Vips::vips_foreign_find_load_suffix(
    ".#{type.preferred_extension}"
  )
  loaders[type] = Cooltrainer::DistorteD::Technology::Vips::vips_get_options(loader_name)
}
121
+
122
+
123
# Returns the Vips::Image for this instance's `path`, opening it on first use
# and reusing the same object afterwards.
#
# TODO: Learn more about what VipsAccess means for our use case,
#       if the default should be changed, and if it should be
#       a user-controllable attr or not.
# https://libvips.github.io/libvips/API/current/VipsImage.html#VipsAccess
# https://libvips.github.io/libvips/API/current/How-it-opens-files.md.html
def to_vips_image
  @vips_image = Vips::Image.new_from_file(path) unless @vips_image
  @vips_image
end
131
+
132
+
133
+ end
@@ -0,0 +1,161 @@
1
+
2
+ # Requiring libvips 8.8 for HEIC/HEIF (moo) support, `justify` support in the
3
+ # Vips::Image text operator, animated WebP support, and more:
4
+ # https://libvips.github.io/libvips/2019/04/22/What's-new-in-8.8.html
5
+
6
+ require 'distorted/element_of_media'
7
+ require 'distorted/triple_counter'
8
# Oldest libvips release DistorteD accepts.
VIPS_MINIMUM_VER = TripleCounter.new(8, 8, 0)

# Load ruby-vips, verify the shared library is new enough, and tell the user
# how to install libvips if the shared library itself could not be opened.
begin
  require 'vips'
  VIPS_AVAILABLE_VER = TripleCounter.new(Vips::version(0), Vips::version(1), Vips::version(2))

  unless VIPS_AVAILABLE_VER >= VIPS_MINIMUM_VER
    raise LoadError, "DistorteD needs libvips #{VIPS_MINIMUM_VER}, but the available version is '#{Vips::version_string}'"
  end

rescue LoadError => le
  # Only decorate failures to open the libvips shared library itself.
  # The library is named differently per platform (libvips.so.N on Linux/BSD,
  # libvips.N.dylib on macOS, libvips-N.dll on Windows), so match all three
  # suffixes instead of only '.so'. The version-mismatch LoadError raised above
  # says "libvips <version>" with no library suffix, so it is re-raised as-is,
  # as is any other unrelated LoadError.
  raise unless le.message =~ /libvips[-.\w]*\.(?:so|dylib|dll)\b/

  # Multiple OS help
  help = <<~INSTALL

    Please install the VIPS (libvips) image processing library, version #{VIPS_MINIMUM_VER} or later.

    FreeBSD:
      pkg install graphics/vips

    macOS:
      brew install vips

    Debian/Ubuntu/Mint:
      apt install libvips libvips-dev
  INSTALL

  # Re-raise the same exception with the install message prepended,
  # keeping the original backtrace.
  raise le, "#{help}\n#{le}", le.backtrace
end
43
+
44
+
45
+ require 'set'
46
+
47
+ require 'distorted/checking_you_out'
48
+
49
+
50
+ module Cooltrainer; end
51
+ module Cooltrainer::DistorteD; end
52
+ module Cooltrainer::DistorteD::Technology; end
53
+ module Cooltrainer::DistorteD::Technology::Vips::Save
54
+
55
+
56
+ # There is one (only one) native libvips image format, with file extname `.vips`.
57
+ # As I write this—running libvips 8.8—the :get_suffixes function does not include
58
+ # its own '.vips' as a supported extension.
59
+ # There also (as of mid 2020) seems to be no official media-type assigned
60
+ # for VIPS format, so I am going to make one up in CHECKING::YOU::OUT's local-data.
61
+ # - Raw pixel data
62
+ #
63
+ # [RAW]: https://libvips.github.io/libvips/API/current/VipsForeignSave.html#vips-rawload
64
+ # https://libvips.github.io/libvips/API/current/VipsForeignSave.html#vips-csvload
65
+ #
66
+ # Most libvips installations, even very minimally-built ones,
67
+ # will almost certainly support a few very common formats:
68
+ # - JPEG with libjpeg.
69
+ # - PNG with libpng.
70
+ # - GIF with giflib.
71
+ # - WebP with libwebp.
72
+ # - TIFF with libtiff.
73
+ #
74
+ # Normal libvips installations probably also support many less-mainstream formats:
75
+ # - HEIF/HEIC with libheif.
76
+ # - ICC profiles with liblcms2.
77
+ # - Matlab with matio/libhdf5.
78
+ # - FITS★ with cfitsio.
79
+ # - Styled text with Pango/ft2.
80
+ # - Saving GIF/BMP with Magick.
81
+ # NOTE that GIFs are *loaded* using giflib,
82
+ # and that BMP loading is unsupported.
83
+ # - Various simple ASCII/binary-based formats with libgsf★
84
+ # · Comma-separated values
85
+ # · Netpbm★
86
+ # · VIPS (non-Matlab) matrices★
87
+ #
88
+ # [NETPBM]: https://en.wikipedia.org/wiki/Netpbm#File_formats
89
+ # [LIBGSF]: https://developer.gnome.org/gsf/
90
+ # [MATRIX]: https://libvips.github.io/libvips/API/current/VipsForeignSave.html#vips-matrixload
91
+
92
+ # Vips allows us to query supported *SAVE* types by suffix.
93
+ # There's a simple relationship between filetype and extension since
94
+ # libvips uses the suffix to pick the Saver module.
95
+ # https://libvips.github.io/libvips/API/current/VipsForeignSave.html
96
+ #
97
+ # Loader modules, on the other hand, are picked by sniffing the
98
+ # first few bytes of the file, so a list of file extensions for
99
+ # supported loadable formats won't always be complete.
100
+ # For example, SVG and PDF are usually supported as loaders
101
+ # (via rsvg and PDFium/Poppler)
102
+ # https://github.com/libvips/ruby-vips/issues/186
103
+ #
104
+ # irb(main)> Vips.get_suffixes
105
+ # => [".csv", ".mat", ".v", ".vips", ".ppm", ".pgm", ".pbm", ".pfm",
106
+ # ".hdr", ".dz", ".png", ".jpg", ".jpeg", ".jpe", ".webp", ".tif",
107
+ # ".tiff", ".fits", ".fit", ".fts", ".gif", ".bmp"]
108
# Set of image MIME::Types libvips can save, built from the file suffixes
# libvips itself reports.
#
# Each CHECKING::YOU::OUT(suffix) call returns a Set of types for that suffix.
# They are merged into a *fresh* Set so that neither the merge nor the
# in-place `keep_if` below mutates a Set owned by CHECKING::YOU::OUT, and so
# an empty suffix list yields an empty Set instead of `nil`.
VIPS_SAVERS = Vips.get_suffixes.each_with_object(Set[]) { |suffix, types|
  # Flatten the Set-of-Sets-of-Types into a Set-of-Types
  types.merge(CHECKING::YOU::OUT(suffix))
}.keep_if { |type|
  # Filter out any of libvips' supported output Types that aren't
  # actually images (e.g. CSV)
  type.media_type == 'image'
}
119
+
120
# Hash of each saveable type => the options reported by :vips_get_options for
# the saver libvips picks for that type's preferred file extension.
OUTER_LIMITS = VIPS_SAVERS.each_with_object({}) { |type, savers|
  saver_name = Vips::vips_foreign_find_save(".#{type.preferred_extension}")
  savers[type] = Cooltrainer::DistorteD::Technology::Vips::vips_get_options(saver_name)
}
126
+
127
+ # Define a to_<mediatype>_<subtype> method for each MIME::Type supported by libvips,
128
+ # e.g. a supported Type 'image/png' will define a method :to_image_png in any
129
+ # context where this module is included.
130
# For every type libvips can save, define an instance method named by that
# type's :distorted_file_method (e.g. :to_image_png for 'image/png') which
# simply delegates to the generic :vips_save.
self::OUTER_LIMITS.each_key do |type|
  writer_name = type.distorted_file_method
  define_method(writer_name) do |dest_root, change|
    vips_save(dest_root, change)
  end
end
135
+
136
+ protected
137
+
138
+ # Generic Vips saver method, optionally handling resizing and cropping.
139
+ # NOTE: libvips chooses a saver (internally) based on the extname of the destination path.
140
+ # TODO: String-buffer version of this method using e.g. Image#jpegsave_buffer
141
# Writes the full-size image to the first of `change.paths(dest_root)`, then
# one thumbnail per entry of `change.breaks` to `change.path(dest_root, break)`,
# cropping with `change.crop` (or :none when that is unset).
#
# Returns whatever `change.breaks.each` returns on success, or nil when
# libvips reports 'No known saver' for the destination. Any other Vips::Error
# propagates to the caller.
def vips_save(dest_root, change)
  to_vips_image.write_to_file(change.paths(dest_root).first)
  change.breaks.each do |width|
    resized = to_vips_image.thumbnail_image(width.to_int, crop: change.crop || :none)
    resized.write_to_file(change.path(dest_root, width))
  end
rescue Vips::Error => error
  raise unless error.message.include?('No known saver')

  # TODO: Handle missing output formats. Replacements? Skip it? Die?
  nil
end
160
+
161
+ end
@@ -0,0 +1,317 @@
1
+ require 'set'
2
+
3
+ ## Adds better support to Ruby's Encoding class for IBM/Microsoft-style numeric codepage IDs:
4
+ # - Adds a `:code_page` method on any instance of Encoding, returning the Integer codepage ID.
5
+ # - Adds a `:page_code` singleton method on :Encoding, returning the Encoding instance for any Integer codepage ID.
6
+ # - Patches Encoding::find() to add Integer and numeric-String find() support using :page_code.
7
+ # - Adds a `:code_page_orphans` singleton method on :Encoding, returning a Set of built-in Encodings that
8
+ # do not have a corresponding built-in `:CP<####>` name/constant.
9
+ # - Includes many additional numeric codepage IDs based on information
10
+ # from IANA, Unicode Consortium, OS vendors, and some of my own speculation.
11
+
12
+ ## This is similar in effect (but not in implementation) to the 'encoding-codepage' Gem:
13
+ # https://github.com/ConradIrwin/encoding-codepage
14
+ # My choice of method name `:code_page` was intentional to avoid conflict with this Gem's `:codepage`.
15
+
16
+
17
+ ## Notes-To-Self about Encoding (the class I'm patching) and encodings in general:
18
+
19
+ ## Unicode Normalization Forms
20
+ #
21
+ # - Canonical composed (NFC) and decomposed (NFD) forms.
22
+ # - Non-canonical composed (NFKC) and decomposed (NFKD) forms.
23
+ #
24
+ # "For example, form C uses the single Unicode code point "Ä" (U+00C4),
25
+ # while form D uses ("A" + "¨", that is U+0041 U+0308).
26
+ # These render identically, because "¨" (U+0308) is a combining character."
27
+ #
28
+ # http://www.unicode.org/faq/normalization.html
29
+ # http://www.unicode.org/reports/tr15/
30
+ # https://docs.microsoft.com/en-us/windows/win32/intl/using-unicode-normalization-to-represent-strings
31
+ # https://en.wikipedia.org/wiki/Precomposed_character
32
+ # https://en.wikipedia.org/wiki/Unicode_equivalence
33
+ #
34
+ # HFS+ is a notable outlier among filesystems by requiring decomposed form (actually 'UTF-8-Mac' variant).
35
+
36
+ ## Ruby includes a lot of the desired codepage ID data built-in,
37
+ # but in the form of String alias names for Encoding instances,
38
+ # e.g. KOI8-R is also codepage 878:
39
+ #
40
+ # irb> Encoding::KOI8_R.names
41
+ # => ["KOI8-R", "CP878"]
42
+ #
43
+ # irb> Encoding::KOI8_R.names.any?{ |n| n =~ /^(CP|IBM|Windows[-_])(?<code_page>\d{3,}$)/ }
44
+ # => true
45
+ # irb> Regexp.last_match
46
+ # => #<MatchData "CP878" code_page:"878">
47
+ #
48
+ # My code defers to this built-in data where possible instead of doing
49
+ # a complete import of the Microsoft identifiers like the Gem.
50
+
51
+
52
+ ## Some encodings have both generic and vendor-prefixed names,
53
+ ## and some are canonically one or the other, e.g.:
54
+ #
55
+ # irb> Encoding::IBM437
56
+ # => #<Encoding:IBM437>
57
+ # irb> Encoding::CP437
58
+ # => #<Encoding:IBM437>
59
+ #
60
+ # irb> Encoding::IBM850
61
+ # => #<Encoding:CP850>
62
+ # irb> Encoding::CP850
63
+ # => #<Encoding:CP850>
64
+
65
+
66
class Encoding

  # Regexp to match and extract Ruby's built-in numeric codepage IDs
  # from their Encoding's names, e.g. "CP878", "IBM437", "Windows-1252".
  #
  # Using IGNORECASE to handle the duplicate differing-capitalization constants,
  # e.g. Encoding::WINDOWS_31J and Encoding::Windows_31J both exist and are equivalent.
  #
  # Worth mentioning since this file deals with Encoding:
  # the Regexp itself also has an internal Encoding that could be changed
  # if there were any reason to (there isn't):
  # https://ruby-doc.org/core/Regexp.html#class-Regexp-label-Encoding
  CODE_PAGE_ENCODING_NAME = Regexp.new('^(CP|IBM|Windows[-_])(?<code_page>\d{3,}$)', Regexp::IGNORECASE)

  # Extra Encoding => Integer codepage ID assignments for Encodings whose
  # built-in names do not carry a numeric ID.
  #
  # Data sources:
  # https://www.aivosto.com/articles/charsets-codepages.html
  # https://developer.apple.com/documentation/coreservices/1400434-ms-dos_and_windows_text_encodings
  # https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  # https://en.wikipedia.org/wiki/CCSID
  # https://github.com/SheetJS/js-codepage/blob/master/codepage.md
  ADDITIONAL_ENCODING_CODE_PAGE_IDS = {

    # US-ASCII
    Encoding::US_ASCII => 20127,

    # Unicode
    Encoding::UTF_16LE => 1200,
    Encoding::UTF_16BE => 1201,
    Encoding::UTF_32LE => 12000,
    Encoding::UTF_32BE => 12001,

    ## Japanese
    #
    # Code Page 932 is Windows-31J, but this provides a fallback mapping
    # between 932 and Shift_JIS to handle detected-text or `encoding` arguments
    # that return Shift_JIS, since that naming is far better known than 31J.
    Encoding::SHIFT_JIS => 932,
    # https://referencesource.microsoft.com/#mscorlib/system/text/eucjpencoding.cs
    # https://www.redmine.org/issues/29442
    # https://www.sljfaq.org/afaq/encodings.html
    # https://uic.jp/charset/
    # http://www.monyo.com/technical/samba/docs/Japanese-HOWTO-3.0.en.txt
    Encoding::EUC_JP_MS => 20932,
    Encoding::EUC_JP => 51932,
    # Encoding:EUC-JIS-2004 has no known ID.
    #
    # https://www.debian.org/doc/manuals/intro-i18n/ch-coding.en.html 3.2: Stateless and Stateful
    # TL;DR: Stateful uses an escape sequence to switch charset;
    #        Stateless have all-unique codepoints.
    # Normal ISO-2022-JP is stateful.
    # "For example, in ISO 2022-JP, two bytes of 0x24 0x2c may mean a Japanese Hiragana character 'が'
    #  or two ASCII character of '$' and ',' according to the shift state."
    # Encoding::STATELESS_ISO_2022_JP
    #
    # Mobile-operator-specific encodings with no known numeric IDs:
    # Encoding:UTF8-DoCoMo
    # Encoding:SJIS-DoCoMo
    # Encoding:UTF8-KDDI
    # Encoding:SJIS-KDDI
    # Encoding:stateless-ISO-2022-JP-KDDI
    # Encoding:UTF8-SoftBank
    # Encoding:SJIS-SoftBank

    ## Chinese
    #
    # https://en.wikipedia.org/wiki/Code_page_903
    Encoding::GB1988 => 903,
    #
    ## Hong Kong Supplementary Character Set
    # The Windows version of this seems to be the built-in CP951:
    # https://web.archive.org/web/20160402215421/https://blogs.msdn.microsoft.com/shawnste/2007/03/12/cp-951-hkscs/
    # https://web.archive.org/web/20141129233053/http://www-01.ibm.com/software/globalization/ccsid/ccsid5471.html
    # NOTE: IBM's CCSID for this is 5471, per the URL above.
    Encoding::BIG5_HKSCS => 5471,
    #
    # The 936 postfix is a reference to the standard Windows Chinese encoding being CP936 / GBK.
    # "GB2312 is the registered internet name for EUC-CN, which is its usual encoded form."
    Encoding::GB2312 => 20936,
    Encoding::GB12345 => 51936,
    #Encoding:GB2312_HZ => 52936,  # Doesn't exist in Ruby
    Encoding::GB18030 => 54936,


    ## Korean, Taiwanese, and Thai
    #
    # The traditional Korean Windows Code Page is CP949, available in Ruby
    # but not under any other name aliases.
    # IBM uses CP1363, not in Ruby.
    Encoding::EUC_KR => 51949,
    #
    # Taiwan
    Encoding::EUC_TW => 51950,
    # Unicode 補完計畫 / Unicode-At-On is a Big5 variant once popular in Taiwan:
    # https://lists.gnu.org/archive/html/bug-gnu-libiconv/2010-11/msg00007.html
    # https://lists.w3.org/Archives/Public/public-html-ig-zh/2012Apr/0061.html
    # Encoding::BIG5_UAO
    #
    # CP950 (available in Ruby) is the code page used on Windows under the name "big5",
    # but the generic Big5 Encoding is mapped to 950 as well to handle
    # detected and specified encodings by that name.
    # "The major difference between Windows code page 950 and "common" (non-vendor-specific) Big5
    # is the incorporation of a subset of the ETEN extensions to Big5 at 0xF9D6 through 0xF9FE
    # (comprising the seven Chinese characters 碁, 銹, 裏, 墻, 恒, 粧, and 嫺,
    # followed by 34 box drawing characters and block elements)."
    Encoding::Big5 => 950,
    #
    # Encoding::TIS_620 is the base Thai 8-bit encoding standard that is apparently
    # never actually used in the wild.
    # ISO-8859-11 is identical to it with the sole exception "that ISO/IEC 8859-11
    # allocates non-breaking space to code 0xA0, while TIS-620 leaves it undefined."
    # "The Microsoft Windows code page 874 as well as the code page used in the
    # Thai version of the Apple Macintosh, MacThai,
    # are variants of TIS-620 — incompatible with each other, however."


    # Eastern European
    #Encoding::KOI8_R => 20866,
    Encoding::KOI8_U => 21866,

    ## ISO/IEC 8859 (8-bit) encoding family
    #
    Encoding::ISO_8859_1 => 28591,  # West European languages (Latin-1)
    Encoding::ISO_8859_2 => 28592,  # Central and East European languages (Latin-2)
    Encoding::ISO_8859_3 => 28593,  # Southeast European and miscellaneous languages (Latin-3)
    Encoding::ISO_8859_4 => 28594,  # Scandinavian/Baltic languages (Latin-4)
    Encoding::ISO_8859_5 => 28595,  # Latin/Cyrillic
    Encoding::ISO_8859_6 => 28596,  # Latin/Arabic
    Encoding::ISO_8859_7 => 28597,  # Latin/Greek
    Encoding::ISO_8859_8 => 28598,  # Latin/Hebrew
    Encoding::ISO_8859_9 => 28599,  # Latin-1 modification for Turkish (Latin-5)
    #
    # ISO-8859-10 covers Nordic languages better than ISO_8859_4.
    # Wikipedia says this has been assigned in Windows as 28600 even though Microsoft's
    # page doesn't list it as of 2020.
    # IBM assigned it as CP919.
    Encoding::ISO_8859_10 => 28600,  # Lappish/Nordic/Eskimo languages (Latin-6)
    #
    # Wikipedia says this is assigned, but same deal.
    Encoding::ISO_8859_11 => 28601,  # Latin/Thai
    #
    # Intended Celtic encoding abandoned in 1997 in favor of ISO_8859_14:
    # Encoding::ISO_8859_12 => 28602,
    #
    Encoding::ISO_8859_13 => 28603,  # Baltic Rim languages (Latin-7)
    Encoding::ISO_8859_14 => 28604,  # Celtic (Latin-8)
    Encoding::ISO_8859_15 => 28605,  # West European languages (Latin-9)
    Encoding::ISO_8859_16 => 28606,  # Romanian (Latin-10)

    # Apple encodings
    #
    # UTF8_MAC is the encoding Mac OS X uses on HFS+ filesystems and is a variant of UTF-8-NFD.
    # https://web.archive.org/web/20140812023313/http://developer.apple.com/library/ios/documentation/MacOSX/Conceptual/BPInternational/Articles/FileEncodings.html
    # "Mac OS Extended (HFS+) uses canonically decomposed Unicode 3.2 in UTF-16 format,
    #  which consists of a sequence of 16-bit codes.
    #  (Characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D are not decomposed.)"
    #
    # There isn't a good Microsoft-style ID to assign to it, so this is just FYI.

    # Classic Mac encodings
    #
    # https://en.wikipedia.org/wiki/Category:Mac_OS_character_encodings
    # http://mirror.informatimago.com/next/developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.1b.html
    #
    # MacRoman pre-OS-8.5 has the "Universal currency symbol" at 0xDB,
    # while 8.5 and later replace it with the (then-new) Euro symbol:
    # https://en.wikipedia.org/wiki/Currency_sign_(typography)
    Encoding::MACROMAN => 10000,
    #
    # "Shift-JIS with JIS Roman modifications, extra 1-byte characters, 2-byte Apple extensions,
    #  and some vertical presentation forms in the range 0xEB40--0xEDFE ("ku plus 84")."
    # Ruby also defines Encoding::MACJAPAN but it's the same Encoding.
    Encoding::MACJAPANESE => 10001,
    #
    # The following encodings are not defined in Ruby's Encoding class,
    # but are listed here for completeness' sake.
    # MACCHINESETRAD => 10002,
    # MACKOREAN => 10003,
    # MACARABIC => 10004,
    # MACHEBREW => 10005,
    # MACGREEK => 10006,
    # MACCYRILLIC => 10007,
    # MACCHINESESIMP => 10008,
    #
    # Unlike MacJapan/MacJapanese, MacRomania is something different than MacRoman.
    Encoding::MACROMANIA => 10010,
    #
    Encoding::MACUKRAINE => 10017,
    Encoding::MACTHAI => 10021,
    Encoding::MACCENTEURO => 10029,
    Encoding::MACICELAND => 10079,
    Encoding::MACTURKISH => 10081,
    Encoding::MACCROATIAN => 10082,

  }  # ADDITIONAL_ENCODING_CODE_PAGE_IDS

  # Returns a Hash of the built-in-orphan Encodings we now have codepage IDs for,
  # e.g. {#<Encoding:US-ASCII>=>20127, #<Encoding:UTF-16BE>=>1201, #<Encoding:UTF-16LE>=>1200}
  #
  # An orphan is adopted only when ADDITIONAL_ENCODING_CODE_PAGE_IDS has an ID
  # for it AND Ruby has no built-in `CP<id>` constant for that ID already, e.g.:
  #   irb> Encoding.const_defined?('CP932')
  #   => true
  # The result is computed once and memoized on the Encoding class itself.
  def self.adopted_encoding_code_page_ids
    @adopted_encoding_code_page_ids ||= code_page_orphans.each_with_object({}) { |encoding, adopted|
      id = ADDITIONAL_ENCODING_CODE_PAGE_IDS[encoding]
      next if id.nil? || Encoding.const_defined?("CP#{id}")
      adopted[encoding] = id
    }
  end

  # Returns a Set of built-in Encodings whose :names /!\ DO NOT /!\ contain a usable
  # numeric codepage ID, as matched by CODE_PAGE_ENCODING_NAME.
  def self.code_page_orphans
    Encoding.list.select { |encoding|
      encoding.respond_to?(:names) && encoding.names.none? { |name| CODE_PAGE_ENCODING_NAME =~ name }
    }.to_set
  end

  # Returns the Encoding instance for an Integer codepage ID, or nil if none is known.
  #
  # Every canonically-Windows*/IBM*-named Encoding seems to also have a
  # 'CP<whatever>' equivalent, so that name is tried first. IDs that only exist
  # in :adopted_encoding_code_page_ids (e.g. 20127 for US-ASCII) are resolved
  # by reverse lookup in that Hash.
  def self.page_code(code_page_id)
    Encoding.find("CP#{code_page_id}")
  rescue ArgumentError
    # Raised by the built-in find for an unknown encoding name.
    adopted_encoding_code_page_ids.key(code_page_id)
  end

  # Returns the Integer codepage ID of this Encoding instance, or nil if it has none.
  #
  # An adopted ID wins; otherwise the first of this Encoding's names that
  # matches CODE_PAGE_ENCODING_NAME supplies the number.
  def code_page
    adopted = Encoding.adopted_encoding_code_page_ids[self]
    return adopted if adopted

    names.each do |name|
      matched = CODE_PAGE_ENCODING_NAME.match(name)
      return matched[:code_page].to_i if matched
    end
    nil
  end

  # Patch the Encoding::find() method to support taking Integer and numeric-String
  # arguments in addition to the arguments it usually supports:
  #   - Integer: looked up with :page_code (nil when unknown).
  #   - Entirely-numeric String (or Symbol): looked up with :page_code, falling
  #     back to the built-in find so genuine numeric names (e.g. the "646"
  #     alias of US-ASCII) keep working; nil when neither knows it.
  #   - Anything else (names, Encoding instances, …): passed straight through
  #     to the built-in find, with its usual return values and exceptions.
  find_you_again = singleton_method(:find)
  define_singleton_method(:find) do |code_page_id|
    case code_page_id
    when Integer
      Encoding.page_code(code_page_id)
    when /\A\d+\z/
      numeric_name = code_page_id.to_s
      Encoding.page_code(numeric_name.to_i) || begin
        find_you_again.call(numeric_name)
      rescue ArgumentError
        nil
      end
    else
      find_you_again.call(code_page_id)
    end
  end

end