RubyGems - static_holmes - Versions diffs - 0.7.7 - Mend

static_holmes 0.7.7

Files changed (12) hide show

checksums.yaml +7 -0
data/ext/charlock_holmes/common.h +41 -0
data/ext/charlock_holmes/converter.c +57 -0
data/ext/charlock_holmes/encoding_detector.c +377 -0
data/ext/charlock_holmes/ext.c +15 -0
data/ext/charlock_holmes/extconf.rb +102 -0
data/ext/charlock_holmes/transliterator.cpp +130 -0
data/lib/charlock_holmes/encoding_detector.rb +76 -0
data/lib/charlock_holmes/string.rb +34 -0
data/lib/charlock_holmes/version.rb +3 -0
data/lib/charlock_holmes.rb +6 -0
metadata +98 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 1e35947554b465d48dc970a60031dcca03df441bd829b4464dba2ecf0bc792bb
+  data.tar.gz: 4cdfaff28364c07fc96a0a77a62d2fb5dbccc130859a4268065084e5abdff449
+SHA512:
+  metadata.gz: 3007aa5b2d53c6046dc65086cb74881ea10f48fda3de2c89a053a42a820f5045b20ce2b716ce9ac0d51f3e6b704a85e9659958cf70c9631e74593637dfbae486
+  data.tar.gz: 13ad7377a3525419518ef4c20ce6c4763a8fc9e8b5cabaa30495d5d79cb8bf0b239e2238a4dcea5bc181572cec4356be3c2d65bf599c440a74a9f398881a6ec5

data/ext/charlock_holmes/common.h ADDED Viewed

@@ -0,0 +1,41 @@
+#ifndef CHARLOCK_COMMON_H
+#define CHARLOCK_COMMON_H
+// tell rbx not to use its caching compat layer
+// by doing this we're making a promise to RBX that
+// we'll never modify the pointers we get back from RSTRING_PTR
+#define RSTRING_NOT_MODIFIED
+#include <ruby.h>
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+#endif
+/*
+ * Build a Ruby String from the `len` bytes starting at `str`.
+ *
+ * When <ruby/encoding.h> is available, `encoding` is cast to an
+ * rb_encoding pointer and handed to rb_external_str_new_with_enc();
+ * otherwise it is ignored and rb_str_new() is used.
+ */
+static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
+{
+#ifndef HAVE_RUBY_ENCODING_H
+	return rb_str_new(str, len);
+#else
+	rb_encoding *enc = (rb_encoding *)encoding;
+
+	return rb_external_str_new_with_enc(str, len, enc);
+#endif
+}
+/*
+ * Build a Ruby String from the `len` bytes starting at `str`, passing
+ * the UTF-8 encoding to rb_external_str_new_with_enc() when Ruby
+ * encoding support is compiled in (plain rb_str_new() otherwise).
+ */
+static inline VALUE charlock_new_str(const char *str, size_t len)
+{
+#ifndef HAVE_RUBY_ENCODING_H
+	return rb_str_new(str, len);
+#else
+	rb_encoding *utf8 = rb_utf8_encoding();
+
+	return rb_external_str_new_with_enc(str, len, utf8);
+#endif
+}
+/*
+ * Same as charlock_new_str(), but for a NUL-terminated C string whose
+ * length is measured with strlen().
+ */
+static inline VALUE charlock_new_str2(const char *str)
+{
+#ifndef HAVE_RUBY_ENCODING_H
+	return rb_str_new2(str);
+#else
+	size_t len = strlen(str);
+
+	return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
+#endif
+}
+#endif

data/ext/charlock_holmes/converter.c ADDED Viewed

@@ -0,0 +1,57 @@
+#include "unicode/ucnv.h"
+#include "common.h"
+extern VALUE rb_mCharlockHolmes;
+static VALUE rb_cConverter;
+/*
+ * call-seq: converted = Converter.convert txt, src_enc, dst_enc
+ *
+ * Convert txt from one ICU converter (encoding) name to another using
+ * ucnv_convert().
+ *
+ * txt     - a String, the bytes to convert
+ * src_enc - a String, the ICU name of the encoding txt is currently in
+ * dst_enc - a String, the ICU name of the encoding to convert to
+ *
+ * Returns: a new String built by charlock_new_enc_str() from the converted bytes
+ * Raises:  TypeError if an argument is not a String,
+ *          ArgumentError if ICU reports a failure, if an encoding name
+ *          contains a NUL byte, or if txt is too large for ICU's int32_t lengths
+ */
+static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
+	VALUE rb_out;
+	const char *src_enc;
+	const char *dst_enc;
+	const char *src_txt;
+	char *out_buf;
+	void *rb_enc = NULL;
+	int32_t src_len;
+	int32_t out_len;
+	UErrorCode status = U_ZERO_ERROR;
+
+	Check_Type(rb_txt, T_STRING);
+	Check_Type(rb_src_enc, T_STRING);
+	Check_Type(rb_dst_enc, T_STRING);
+
+	// ICU takes int32_t lengths; refuse input that would be silently truncated
+	if (RSTRING_LEN(rb_txt) > INT32_MAX) {
+		rb_raise(rb_eArgError, "input is too large to convert");
+	}
+
+	src_txt = RSTRING_PTR(rb_txt);
+	src_len = (int32_t)RSTRING_LEN(rb_txt);
+
+	// ucnv_convert() expects NUL-terminated converter names
+	src_enc = StringValueCStr(rb_src_enc);
+	dst_enc = StringValueCStr(rb_dst_enc);
+
+	// first determine the size of the output buffer
+	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
+		rb_raise(rb_eArgError, "%s", u_errorName(status));
+	}
+
+	// look the Ruby encoding up before allocating, so nothing is held across this call
+#ifdef HAVE_RUBY_ENCODING_H
+	rb_enc = (void *)rb_enc_find(dst_enc);
+#endif
+
+	// no overflow on a zero-capacity buffer means the output is empty
+	// (e.g. empty input); that is a valid result, not an error
+	if (status != U_BUFFER_OVERFLOW_ERROR || out_len <= 0) {
+		return charlock_new_enc_str("", 0, rb_enc);
+	}
+
+	out_buf = malloc((size_t)out_len);
+	if (!out_buf) {
+		rb_memerror();
+	}
+
+	// now do the actual conversion
+	status = U_ZERO_ERROR;
+	out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
+	if (U_FAILURE(status)) {
+		free(out_buf);
+		rb_raise(rb_eArgError, "%s", u_errorName(status));
+	}
+
+	rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
+
+	free(out_buf);
+
+	return rb_out;
+}
+/* Defines CharlockHolmes::Converter and its singleton method `convert` (3 arguments). */
+void _init_charlock_converter() {
+	VALUE klass = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
+
+	rb_cConverter = klass;
+	rb_define_singleton_method(klass, "convert", rb_converter_convert, 3);
+}

data/ext/charlock_holmes/encoding_detector.c ADDED Viewed

@@ -0,0 +1,377 @@
+#include "unicode/ucsdet.h"
+#include "common.h"
+extern VALUE rb_mCharlockHolmes;
+static VALUE rb_cEncodingDetector;
+/*
+ * Native state wrapped inside each EncodingDetector object: the ICU
+ * charset detector handle, opened in rb_encdec__alloc() and closed in
+ * rb_encdec__free().
+ */
+typedef struct {
+	UCharsetDetector *csd;
+} charlock_detector_t;
+/*
+ * Turn an ICU charset match into the Hash handed back to Ruby.
+ *
+ * match - the ICU match to describe, may be NULL
+ *
+ * Returns: nil when match is NULL or ICU cannot report its name; otherwise
+ *          a Hash with :type (:text), :encoding, :confidence, plus
+ *          :ruby_encoding when @encoding_table has an entry for the name and
+ *          :language when ICU reports a non-empty language.
+ */
+static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	const char *mname;
+	const char *mlang;
+	int mconfidence;
+	VALUE rb_match;
+	VALUE enc_tbl;
+	VALUE enc_name;
+	VALUE compat_enc;
+
+	if (!match)
+		return Qnil;
+
+	mname = ucsdet_getName(match, &status);
+	mlang = ucsdet_getLanguage(match, &status);
+	mconfidence = ucsdet_getConfidence(match, &status);
+
+	// without a usable name there is nothing to report, and
+	// charlock_new_str2() would call strlen() on NULL
+	if (U_FAILURE(status) || !mname)
+		return Qnil;
+
+	rb_match = rb_hash_new();
+
+	rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
+
+	enc_name = charlock_new_str2(mname);
+	rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
+
+	// @encoding_table is populated from the Ruby side; it is nil when only
+	// the native extension has been loaded, so only index a real Hash
+	enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
+	if (TYPE(enc_tbl) == T_HASH) {
+		compat_enc = rb_hash_aref(enc_tbl, enc_name);
+		if (!NIL_P(compat_enc)) {
+			rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
+		}
+	}
+
+	rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
+
+	if (mlang && mlang[0])
+		rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
+
+	return rb_match;
+}
+/* Returns the fixed match Hash for binary content: :type => :binary, :confidence => 100. */
+static VALUE rb_encdec_binarymatch() {
+	VALUE result = rb_hash_new();
+	VALUE type_key = ID2SYM(rb_intern("type"));
+	VALUE type_val = ID2SYM(rb_intern("binary"));
+
+	rb_hash_aset(result, type_key, type_val);
+	rb_hash_aset(result, ID2SYM(rb_intern("confidence")), INT2NUM(100));
+
+	return result;
+}
+/*
+ * Decide whether rb_str looks like binary content.
+ *
+ * The leading bytes are compared against a fixed list of signatures, in
+ * order; the first one that matches decides the answer. When none match,
+ * the content counts as binary if a NULL byte occurs within the first
+ * @binary_scan_length bytes (capped at the string length).
+ *
+ * Returns: 1 for binary, 0 otherwise
+ */
+static int detect_binary_content(VALUE self, VALUE rb_str) {
+	static const struct {
+		const char *magic;
+		size_t len;
+		int binary;
+	} signatures[] = {
+		{ "%!PS-Adobe-", 11, 0 },             // application/postscript
+		{ "\x89PNG\x0D\x0A\x1A\x0A", 8, 1 },  // image/png
+		{ "GIF87a", 6, 1 },                   // image/gif
+		{ "GIF89a", 6, 1 },                   // image/gif
+		{ "%PDF-", 5, 1 },                    // application/pdf
+		{ "\0\0\xfe\xff", 4, 0 },             // UTF-32BE
+		{ "\xff\xfe\0\0", 4, 0 },             // UTF-32LE
+		{ "\xFF\xD8\xFF", 3, 1 },             // image/jpeg
+		{ "\xfe\xff", 2, 0 },                 // UTF-16BE
+		{ "\xff\xfe", 2, 0 },                 // UTF-16LE
+	};
+	const char *data = RSTRING_PTR(rb_str);
+	size_t data_len = RSTRING_LEN(rb_str);
+	size_t limit = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));
+	size_t i;
+
+	for (i = 0; i < sizeof(signatures) / sizeof(signatures[0]); i++) {
+		// only compare when the content is at least as long as the signature
+		if (data_len >= signatures[i].len &&
+		    !memcmp(data, signatures[i].magic, signatures[i].len))
+			return signatures[i].binary;
+	}
+
+	/*
+	 * No signature matched: any NULL byte within the scanned range
+	 * will likely mean the contents are binary.
+	 */
+	if (limit > data_len)
+		limit = data_len;
+
+	return !!memchr(data, 0, limit);
+}
+/*
+ * call-seq: true/false = EncodingDetector#is_binary? str
+ *
+ * Attempt to detect if a string is binary or text
+ *
+ * str      - a String, what you want to perform the binary check on
+ *
+ * Returns: true or false
+ * Raises:  TypeError if str is not a String
+ */
+static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
+{
+	// detect_binary_content() reads the raw string buffer, so reject
+	// anything that is not a String before it gets there
+	Check_Type(str, T_STRING);
+
+	if (detect_binary_content(self, str))
+		return Qtrue;
+	else
+		return Qfalse;
+}
+/*
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
+ *
+ * Attempt to detect the encoding of this string
+ *
+ * str      - a String, what you want to detect the encoding of
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
+ *            be used as an additional hint to the charset detector
+ *
+ * Returns: a Hash with :encoding, :language, :type and :confidence
+ */
+static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
+{
+	charlock_detector_t *det;
+	const UCharsetMatch *best;
+	VALUE text;
+	VALUE hint;
+	UErrorCode err = U_ZERO_ERROR;
+
+	rb_scan_args(argc, argv, "11", &text, &hint);
+	Check_Type(text, T_STRING);
+	Data_Get_Struct(self, charlock_detector_t, det);
+
+	// binary-looking content short-circuits charset detection entirely
+	if (detect_binary_content(self, text))
+		return rb_encdec_binarymatch();
+
+	// the data doesn't look binary, so let ICU work out the text encoding
+	ucsdet_setText(det->csd, RSTRING_PTR(text), (int32_t)RSTRING_LEN(text), &err);
+
+	if (!NIL_P(hint)) {
+		Check_Type(hint, T_STRING);
+		ucsdet_setDeclaredEncoding(det->csd, RSTRING_PTR(hint), RSTRING_LEN(hint), &err);
+	}
+
+	best = ucsdet_detect(det->csd, &err);
+
+	return rb_encdec_buildmatch(best);
+}
+/*
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
+ *
+ * Attempt to detect the encoding of this string, and return
+ * a list with all the possible encodings that match it.
+ *
+ * str      - a String, what you want to detect the encoding of
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
+ *            be used as an additional hint to the charset detector
+ *
+ * Returns: an Array with zero or more Hashes,
+ *          each one of them with :encoding, :language, :type and :confidence.
+ *          When the content looks binary, the binary match Hash comes first.
+ */
+static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	charlock_detector_t *detector;
+	const UCharsetMatch **csm;
+	VALUE rb_ret;
+	int32_t i;
+	// start from zero: the count is only meaningful when ICU succeeds
+	int32_t match_count = 0;
+	VALUE rb_str;
+	VALUE rb_enc_hint;
+	VALUE binary_match;
+
+	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
+	Check_Type(rb_str, T_STRING);
+	Data_Get_Struct(self, charlock_detector_t, detector);
+
+	rb_ret = rb_ary_new();
+
+	// first let's see if this is binary content
+	binary_match = Qnil;
+	if (detect_binary_content(self, rb_str)) {
+		binary_match = rb_encdec_binarymatch();
+	}
+
+	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
+
+	if (!NIL_P(rb_enc_hint)) {
+		Check_Type(rb_enc_hint, T_STRING);
+		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
+	}
+
+	csm = ucsdet_detectAll(detector->csd, &match_count, &status);
+
+	// only walk the match array when ICU actually produced one
+	if (U_SUCCESS(status) && csm) {
+		for (i = 0; i < match_count; ++i) {
+			rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
+		}
+	}
+
+	if (!NIL_P(binary_match))
+		rb_ary_unshift(rb_ret, binary_match);
+
+	return rb_ret;
+}
+/*
+ * call-seq: EncodingDetector#strip_tags?
+ *
+ * Reports whether the ICU input filter (the strip_tags flag) is enabled
+ * on this detector
+ *
+ * Returns: Boolean
+ */
+static VALUE rb_get_strip_tags(VALUE self)
+{
+	charlock_detector_t *det;
+
+	Data_Get_Struct(self, charlock_detector_t, det);
+
+	if (ucsdet_isInputFilterEnabled(det->csd) == 1)
+		return Qtrue;
+
+	return Qfalse;
+}
+/*
+ * call-seq: EncodingDetector#strip_tags = true
+ *
+ * Enable or disable the stripping of HTML/XML tags from the input before
+ * attempting any detection. Only the value `true` enables the filter;
+ * any other value disables it.
+ *
+ * Returns: the value passed
+ */
+static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
+{
+	charlock_detector_t *det;
+	UBool enable = 0;
+
+	Data_Get_Struct(self, charlock_detector_t, det);
+
+	if (rb_val == Qtrue)
+		enable = 1;
+
+	ucsdet_enableInputFilter(det->csd, enable);
+
+	return rb_val;
+}
+/*
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
+ *
+ * The list of detectable encodings supported by this library. The list is
+ * built on first use and cached on the class; five windows-125x names are
+ * listed ahead of the names reported by ICU.
+ *
+ * Returns: an Array of Strings
+ * Raises:  StandardError if ICU cannot open a detector or enumerate its charsets
+ */
+static VALUE rb_get_supported_encodings(VALUE klass)
+{
+	UCharsetDetector *csd;
+	UErrorCode status = U_ZERO_ERROR;
+	UEnumeration *encoding_list;
+	VALUE rb_encoding_list;
+	int32_t enc_count;
+	int32_t i;
+	const char *enc_name;
+	int32_t enc_name_len;
+
+	rb_encoding_list = rb_iv_get(klass, "encoding_list");
+
+	// already populated on an earlier call
+	if (!NIL_P(rb_encoding_list))
+		return rb_encoding_list;
+
+	csd = ucsdet_open(&status);
+	encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
+	enc_count = uenum_count(encoding_list, &status);
+
+	// ICU calls are no-ops once status holds a failure, so one check covers all three
+	if (U_FAILURE(status)) {
+		uenum_close(encoding_list);
+		ucsdet_close(csd);
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	}
+
+	rb_encoding_list = rb_ary_new();
+
+	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
+	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
+	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
+	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
+	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));
+
+	for(i=0; i < enc_count; i++) {
+		enc_name = uenum_next(encoding_list, &enc_name_len, &status);
+		// NULL marks the end of the enumeration (or an error): stop rather than read it
+		if (U_FAILURE(status) || !enc_name)
+			break;
+		rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
+	}
+
+	// the enumeration is owned by the caller and must be closed explicitly
+	uenum_close(encoding_list);
+	ucsdet_close(csd);
+
+	rb_iv_set(klass, "encoding_list", rb_encoding_list);
+
+	return rb_encoding_list;
+}
+/* Free callback for wrapped detectors: closes the ICU handle if one was opened, then frees the struct. */
+static void rb_encdec__free(void *obj)
+{
+	charlock_detector_t *det = (charlock_detector_t *)obj;
+	UCharsetDetector *handle = det->csd;
+
+	if (handle != NULL)
+		ucsdet_close(handle);
+
+	free(det);
+}
+/*
+ * Allocator for EncodingDetector: wraps a zeroed charlock_detector_t in a
+ * Ruby object and opens an ICU charset detector for it.
+ *
+ * Raises: NoMemoryError if the struct cannot be allocated,
+ *         StandardError (with the ICU error name) if ucsdet_open() fails
+ */
+static VALUE rb_encdec__alloc(VALUE klass)
+{
+	charlock_detector_t *detector;
+	UErrorCode status = U_ZERO_ERROR;
+	VALUE obj;
+
+	detector = calloc(1, sizeof(charlock_detector_t));
+	// calloc can fail; bail out before the NULL pointer is wrapped or written through
+	if (!detector)
+		rb_memerror();
+
+	// wrap first so the struct is released by rb_encdec__free even if we raise below
+	obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
+
+	detector->csd = ucsdet_open(&status);
+	if (U_FAILURE(status)) {
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	}
+
+	return obj;
+}
+/*
+ * Defines CharlockHolmes::EncodingDetector: its allocator, the instance
+ * methods is_binary?, detect, detect_all, strip_tags and strip_tags=, and
+ * the singleton method supported_encodings.
+ */
+void _init_charlock_encoding_detector()
+{
+	VALUE klass = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
+
+	rb_cEncodingDetector = klass;
+	rb_define_alloc_func(klass, rb_encdec__alloc);
+
+	rb_define_method(klass, "is_binary?", rb_encdec_is_binary, 1);
+	rb_define_method(klass, "detect", rb_encdec_detect, -1);
+	rb_define_method(klass, "detect_all", rb_encdec_detect_all, -1);
+	rb_define_method(klass, "strip_tags", rb_get_strip_tags, 0);
+	rb_define_method(klass, "strip_tags=", rb_set_strip_tags, 1);
+
+	rb_define_singleton_method(klass, "supported_encodings", rb_get_supported_encodings, 0);
+}

data/ext/charlock_holmes/ext.c ADDED Viewed

@@ -0,0 +1,15 @@
+#include "common.h"
+extern void _init_charlock_encoding_detector();
+extern void _init_charlock_converter();
+extern void _init_charlock_transliterator();
+VALUE rb_mCharlockHolmes;
+/*
+ * Extension entry point: defines the CharlockHolmes module and then runs
+ * the three per-feature initializers declared above.
+ */
+void Init_charlock_holmes() {
+	rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
+	// the module is assigned before the initializers run; they are defined
+	// in other files and presumably register their classes under it
+	_init_charlock_encoding_detector();
+	_init_charlock_converter();
+	_init_charlock_transliterator();
+}

data/ext/charlock_holmes/extconf.rb ADDED Viewed

@@ -0,0 +1,102 @@
+require 'mkmf'
+# Abort early with a visible banner when `which make` prints nothing,
+# i.e. no make executable was found on the PATH.
+if `which make`.strip.empty?
+  STDERR.puts "\n\n"
+  STDERR.puts "***************************************************************************************"
+  STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************"
+  STDERR.puts "***************************************************************************************"
+  exit(1)
+end
+##
+# ICU dependency
+#
+# Default include/library directories stay nil unless Homebrew supplies them below.
+ldflags = cppflags = nil
+if RbConfig::CONFIG["host_os"] =~ /darwin/
+  begin
+    # On macOS, ask Homebrew where icu4c lives and derive the directories from it
+    brew_prefix = `brew --prefix icu4c`.chomp
+    ldflags   = "#{brew_prefix}/lib"
+    cppflags  = "#{brew_prefix}/include"
+    pkg_conf  = "#{brew_prefix}/lib/pkgconfig"
+    # pkg_config should be less error prone than parsing compiler
+    # commandline options, but we need to set default ldflags and cpp flags
+    # in case the user doesn't have pkg-config installed
+    ENV['PKG_CONFIG_PATH'] ||= pkg_conf
+  rescue
+    # deliberately best-effort: any error here (e.g. brew not runnable) leaves the defaults at nil
+  end
+end
+# cppflags/ldflags are passed as the default include and library directories for 'icu'
+dir_config 'icu', cppflags, ldflags
+pkg_config("icu-i18n")
+pkg_config("icu-io")
+pkg_config("icu-uc")
+# Only add a C++ standard flag when none has been chosen already
+$CXXFLAGS << ' -std=c++11' unless $CXXFLAGS.include?("-std=")
+unless have_library 'icui18n' and have_header 'unicode/ucnv.h'
+  STDERR.puts "\n\n"
+  STDERR.puts "***************************************************************************************"
+  STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********"
+  STDERR.puts "***************************************************************************************"
+  exit(1)
+end
+# The remaining libraries are hard requirements: abort with a message if any is missing
+have_library 'z' or abort 'libz missing'
+have_library 'icuuc' or abort 'libicuuc missing'
+have_library 'icudata' or abort 'libicudata missing'
+$CFLAGS << ' -Wall -funroll-loops'
+# Extra warnings and debug info are added only when the DEBUG environment variable is set
+$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
+def libflag_to_filename(ldflag)
+  case ldflag
+  when /\A-l(.+)/
+    "lib#{Regexp.last_match(1)}.#{$LIBEXT}"
+  end
+end
+def resolve_static_library(libflag, dirs)
+  filename = libflag_to_filename(libflag)
+  dir = dirs.find { |path| File.exist?(File.join(path, filename)) }
+  raise "Unable to find #{filename} in #{dirs}" unless dir
+  File.join(dir, filename)
+end
+def substitute_static_libs(packages)
+  # First, find all the -l<lib> flags added by pkg-config. We want to drop
+  # these dynamically linked libraries and substitute them with the static libraries.
+  libflags = packages.map do |pkg|
+    pkg_config(pkg, 'libs-only-l')&.strip&.split(' ')
+  end.flatten.uniq
+  # To find where the static libraries live, we need to search the
+  # library paths given by the -L flag from pkg-config.
+  lib_paths = packages.map do |pkg|
+    include_path = pkg_config(pkg, 'libs-only-L')&.strip
+    include_path&.split(' ').map { |lib| lib.gsub(/^-L/, '') }
+  end.flatten.uniq
+  # Drop the -l<lib> flags and add in the static libraries.
+  new_libs = $libs.shellsplit
+  new_libs.reject! { |arg| libflags.include?(arg) }
+  libflags.each { |flag| new_libs << resolve_static_library(flag, lib_paths) }
+  $libs = new_libs.uniq.shelljoin
+end
+# Static linking is opt-in: enable_config is called with a default of false
+static_p = enable_config('static', false)
+message "Static linking is #{static_p ? 'enabled' : 'disabled'}.\n"
+if static_p
+  $CXXFLAGS << ' -fPIC'
+  ENV['PKG_CONFIG_ALLOW_SYSTEM_LIBS'] = '1'
+  # Swap the -l flags of the three ICU packages for their static archives
+  substitute_static_libs(%w[icu-i18n icu-io icu-uc])
+end
+# Generate the Makefile for the charlock_holmes/charlock_holmes extension
+create_makefile 'charlock_holmes/charlock_holmes'

data/ext/charlock_holmes/transliterator.cpp ADDED Viewed

@@ -0,0 +1,130 @@
+#include "common.h"
+#undef UChar
+#include <string>
+#include <unicode/translit.h>
+extern "C" {
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+static VALUE rb_eEncodingCompatibilityError;
+/*
+ * Raises rb_eEncodingCompatibilityError unless the encoding of str is
+ * UTF-8, US-ASCII or ASCII-8BIT. The three encoding pointers are looked
+ * up once and kept in function-local statics.
+ */
+static void check_utf8_encoding(VALUE str) {
+  static rb_encoding *utf8 = NULL;
+  static rb_encoding *usascii = NULL;
+  static rb_encoding *ascii8bit = NULL;
+  rb_encoding *actual;
+
+  if (utf8 == NULL) {
+    utf8 = rb_utf8_encoding();
+    usascii = rb_usascii_encoding();
+    ascii8bit = rb_ascii8bit_encoding();
+  }
+
+  actual = rb_enc_get(str);
+  if (actual == utf8 || actual == usascii || actual == ascii8bit) {
+    return;
+  }
+
+  rb_raise(rb_eEncodingCompatibilityError,
+    "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(actual));
+}
+#else
+/* Without Ruby encoding support there is nothing to check. */
+static void check_utf8_encoding(VALUE str) {}
+#endif
+extern VALUE rb_mCharlockHolmes;
+static VALUE rb_cTransliterator;
+/*
+ * call-seq: ids = Transliterator.id_list
+ *
+ * Returns: an Array of Strings, the transliterator IDs reported by
+ *          icu::Transliterator::getAvailableIDs()
+ * Raises:  ArgumentError (with the ICU error name) if ICU reports a failure
+ */
+static VALUE rb_transliterator_id_list(VALUE self) {
+  UErrorCode status = U_ZERO_ERROR;
+  icu::StringEnumeration *id_list;
+  int32_t id_list_size;
+  const char *curr_id;
+  int32_t curr_id_len;
+  VALUE rb_ary;
+  VALUE rb_curr_id;
+
+  id_list_size = 0;
+  id_list = icu::Transliterator::getAvailableIDs(status);
+  if (U_SUCCESS(status) && id_list == NULL) {
+    // no enumeration and no error code: report it as an allocation failure
+    status = U_MEMORY_ALLOCATION_ERROR;
+  }
+  if(!U_SUCCESS(status)) {
+    // rb_raise does not return, so release the enumeration first
+    delete id_list;
+    rb_raise(rb_eArgError, "%s", u_errorName(status));
+  }
+
+  status = U_ZERO_ERROR;
+  id_list_size = id_list->count(status);
+  if(!U_SUCCESS(status)) {
+    delete id_list;
+    rb_raise(rb_eArgError, "%s", u_errorName(status));
+  }
+
+  rb_ary = rb_ary_new2(id_list_size);
+
+  do {
+    curr_id_len = 0;
+    curr_id = id_list->next(&curr_id_len, status);
+    if(!U_SUCCESS(status)) {
+      delete id_list;
+      rb_raise(rb_eArgError, "%s", u_errorName(status));
+    }
+
+    // a NULL id marks the end of the enumeration
+    if (curr_id != NULL) {
+      rb_curr_id = charlock_new_str(curr_id, curr_id_len);
+      rb_ary_push(rb_ary, rb_curr_id);
+    }
+  } while(curr_id != NULL);
+
+  delete id_list;
+
+  return rb_ary;
+}
+/*
+ * call-seq: out = Transliterator.transliterate txt, id
+ *
+ * Run the ICU transliterator named by id over txt.
+ *
+ * txt - a String accepted by check_utf8_encoding(), the text to transliterate
+ * id  - a String accepted by check_utf8_encoding(), the ICU transliterator ID
+ *
+ * Returns: a new String holding the UTF-8 result
+ * Raises:  TypeError if an argument is not a String,
+ *          ArgumentError if ICU cannot create the transliterator or an
+ *          argument is too large for ICU's int32_t lengths
+ */
+static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
+  UErrorCode status = U_ZERO_ERROR;
+  UParseError p_error;
+  icu::Transliterator *trans;
+  const char *txt;
+  size_t txt_len;
+  const char *id;
+  size_t id_len;
+  VALUE rb_out;
+
+  Check_Type(rb_txt, T_STRING);
+  Check_Type(rb_id, T_STRING);
+
+  check_utf8_encoding(rb_txt);
+  check_utf8_encoding(rb_id);
+
+  txt = RSTRING_PTR(rb_txt);
+  txt_len = RSTRING_LEN(rb_txt);
+  id = RSTRING_PTR(rb_id);
+  id_len = RSTRING_LEN(rb_id);
+
+  // ICU string lengths are int32_t; refuse anything that would be truncated
+  if (txt_len > (size_t)INT32_MAX || id_len > (size_t)INT32_MAX) {
+    rb_raise(rb_eArgError, "input is too large to transliterate");
+  }
+
+  // decode explicitly as UTF-8: the (const char *, length) UnicodeString
+  // constructor would use the platform default codepage instead
+  trans = icu::Transliterator::createInstance(
+    icu::UnicodeString::fromUTF8(icu::StringPiece(id, (int32_t)id_len)),
+    UTRANS_FORWARD, p_error, status);
+  if(!U_SUCCESS(status) || trans == NULL) {
+    // rb_raise does not return, so nothing may be left allocated here
+    delete trans;
+    rb_raise(rb_eArgError, "%s", u_errorName(status));
+  }
+
+  std::string result;
+  {
+    // scoped so the UnicodeString is destroyed before any Ruby call below
+    icu::UnicodeString u_txt =
+      icu::UnicodeString::fromUTF8(icu::StringPiece(txt, (int32_t)txt_len));
+
+    trans->transliterate(u_txt);
+
+    icu::StringByteSink<std::string> sink(&result);
+    u_txt.toUTF8(sink);
+  }
+
+  delete trans;
+
+  rb_out = charlock_new_str(result.data(), result.length());
+
+  return rb_out;
+}
+/*
+ * Defines CharlockHolmes::Transliterator with the singleton methods
+ * id_list and transliterate, and (when Ruby encoding support is present)
+ * looks up Encoding::CompatibilityError for check_utf8_encoding().
+ */
+void _init_charlock_transliterator() {
+  typedef VALUE (*anyargs_fn)(...);
+
+#ifdef HAVE_RUBY_ENCODING_H
+  rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
+#endif
+
+  rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
+
+  rb_define_singleton_method(rb_cTransliterator, "id_list", (anyargs_fn)rb_transliterator_id_list, 0);
+  rb_define_singleton_method(rb_cTransliterator, "transliterate", (anyargs_fn)rb_transliterator_transliterate, 2);
+}
+}

data/lib/charlock_holmes/encoding_detector.rb ADDED Viewed

@@ -0,0 +1,76 @@
+module CharlockHolmes
+  class EncodingDetector
+    # Default length for which to scan content for NULL bytes
+    DEFAULT_BINARY_SCAN_LEN = 1024*1024
+    # Length for which to scan content for NULL bytes
+    attr_accessor :binary_scan_length
+    alias :strip_tags? :strip_tags
+    def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN)
+      @binary_scan_length = scan_len
+    end
+    # Attempt to detect the encoding of this string
+    #
+    # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
+    # as well as use the default binary scan length
+    #
+    # str      - a String, what you want to detect the encoding of
+    # hint_enc - an optional String (like "UTF-8"), the encoding name which will
+    #            be used as an additional hint to the charset detector
+    #
+    # Returns: a Hash with :encoding, :language, :type and :confidence
+    def self.detect(str, hint_enc=nil)
+      new.detect(str, hint_enc)
+    end
+    # Attempt to detect the encoding of this string, and return
+    # a list with all the possible encodings that match it.
+    #
+    # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
+    # as well as use the default binary scan length
+    #
+    # str      - a String, what you want to detect the encoding of
+    # hint_enc - an optional String (like "UTF-8"), the encoding name which will
+    #            be used as an additional hint to the charset detector
+    #
+    # Returns: an Array with zero or more Hashes,
+    # each one of them with with :encoding, :language, :type and :confidence
+    def self.detect_all(str, hint_enc=nil)
+      new.detect_all(str, hint_enc)
+    end
+    # A mapping table of supported encoding names from EncodingDetector
+    # which point to the corresponding supported encoding name in Ruby.
+    # Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "ASCII-8BIT"}
+    #
+    # Note that encodings that can't be mapped between Charlock and Ruby will resolve
+    # to "ASCII-8BIT".
+    @encoding_table = {}
+    def self.encoding_table
+      @encoding_table
+    end
+    BINARY = 'binary'
+    # Builds the ENCODING_TABLE hash by running through the list of supported encodings
+    # in the ICU detection API and trying to map them to supported encodings in Ruby.
+    # This is built dynamically so as to take advantage of ICU upgrades which may have
+    # support for more encodings in the future.
+    #
+    # Returns nothing.
+    def self.build_encoding_table
+      supported_encodings.each do |name|
+        @encoding_table[name] = begin
+          ::Encoding.find(name).name
+        rescue ArgumentError
+          BINARY
+        end
+      end
+    end
+    build_encoding_table
+  end
+end

data/lib/charlock_holmes/string.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'charlock_holmes' unless defined? CharlockHolmes
+class String
+  # Attempt to detect the encoding of this string
+  #
+  # Returns: a Hash with :encoding, :language, :type and :confidence
+  def detect_encoding(hint_enc=nil)
+    detector = CharlockHolmes::EncodingDetector.new
+    detector.detect(self, hint_enc)
+  end
+  # Attempt to detect the encoding of this string, and return
+  # a list with all the possible encodings that match it.
+  #
+  # Returns: an Array with zero or more Hashes,
+  #          each one of them with with :encoding, :language, :type and :confidence
+  def detect_encodings(hint_enc=nil)
+    detector = CharlockHolmes::EncodingDetector.new
+    detector.detect_all(self, hint_enc)
+  end
+  if method_defined? :force_encoding
+    # Attempt to detect the encoding of this string
+    # then set the encoding to what was detected ala `force_encoding`
+    #
+    # Returns: self
+    def detect_encoding!(hint_enc=nil)
+      if detected = self.detect_encoding(hint_enc)
+        self.force_encoding(detected[:ruby_encoding]) if detected[:ruby_encoding]
+      end
+      self
+    end
+  end
+end

data/lib/charlock_holmes/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module CharlockHolmes
+  # Version string of the charlock_holmes library code
+  VERSION = "0.7.7"
+end

data/lib/charlock_holmes.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'charlock_holmes/charlock_holmes'
+require 'charlock_holmes/encoding_detector'
+require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
+# require this if you want the String monkey patches
+# require 'charlock_holmes/string'

metadata ADDED Viewed

@@ -0,0 +1,98 @@
+--- !ruby/object:Gem::Specification
+name: static_holmes
+version: !ruby/object:Gem::Version
+  version: 0.7.7
+platform: ruby
+authors:
+- Brian Lopez
+- Vicent Martí
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2024-03-23 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.11'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.11'
+- !ruby/object:Gem::Dependency
+  name: chardet
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.9'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.9'
+description: charlock_holmes provides binary and text detection as well as text transcoding
+  using libicu
+email: seniorlopez@gmail.com
+executables: []
+extensions:
+- ext/charlock_holmes/extconf.rb
+extra_rdoc_files: []
+files:
+- ext/charlock_holmes/common.h
+- ext/charlock_holmes/converter.c
+- ext/charlock_holmes/encoding_detector.c
+- ext/charlock_holmes/ext.c
+- ext/charlock_holmes/extconf.rb
+- ext/charlock_holmes/transliterator.cpp
+- lib/charlock_holmes.rb
+- lib/charlock_holmes/encoding_detector.rb
+- lib/charlock_holmes/string.rb
+- lib/charlock_holmes/version.rb
+homepage: https://gitlab.com/gitlab-org/ruby/gems/charlock_holmes
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options:
+- "--charset=UTF-8"
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 1.9.3
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.6
+signing_key:
+specification_version: 4
+summary: Character encoding detection, brought to you by ICU
+test_files: []