RubyGems - charlock_holmes_heroku - Versions diffs - 0.6.13 - Mend

charlock_holmes_heroku 0.6.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +7 -0
data/.gitignore +9 -0
data/.rspec +3 -0
data/Gemfile +3 -0
data/Gemfile.lock +30 -0
data/MIT-LICENSE +20 -0
data/README.md +111 -0
data/Rakefile +29 -0
data/benchmark/detection.rb +39 -0
data/benchmark/test.txt +693 -0
data/charlock_holmes.gemspec +25 -0
data/ext/charlock_holmes/common.h +41 -0
data/ext/charlock_holmes/converter.c +53 -0
data/ext/charlock_holmes/encoding_detector.c +295 -0
data/ext/charlock_holmes/ext.c +13 -0
data/ext/charlock_holmes/extconf.rb +86 -0
data/ext/charlock_holmes/src/icu4c-49_1_2-src.tgz +0 -0
data/ext/charlock_holmes/src/icu4c-52_1-src.tgz +0 -0
data/lib/charlock_holmes.rb +6 -0
data/lib/charlock_holmes/encoding_detector.rb +33 -0
data/lib/charlock_holmes/string.rb +34 -0
data/lib/charlock_holmes/version.rb +3 -0
data/spec/converter_spec.rb +29 -0
data/spec/encoding_detector_spec.rb +122 -0
data/spec/fixtures/AnsiGraph.psm1 +0 -0
data/spec/fixtures/TwigExtensionsDate.es.yml +8 -0
data/spec/fixtures/cl-messagepack.lisp +264 -0
data/spec/fixtures/core.rkt +254 -0
data/spec/fixtures/hello_world +0 -0
data/spec/fixtures/laholator.py +131 -0
data/spec/fixtures/repl2.cljs +109 -0
data/spec/spec_helper.rb +9 -0
data/spec/string_method_spec.rb +52 -0
metadata +133 -0

data/charlock_holmes.gemspec ADDED

@@ -0,0 +1,25 @@
+# encoding: utf-8
+require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
+Gem::Specification.new do |s|
+  s.name = %q{charlock_holmes_heroku}
+  s.version = CharlockHolmes::VERSION
+  s.authors = ["Brian Lopez", "Vicent Martí"]
+  s.date = Time.now.utc.strftime("%Y-%m-%d")
+  s.email = %q{seniorlopez@gmail.com}
+  s.extensions = ["ext/charlock_holmes/extconf.rb"]
+  s.files = `git ls-files`.split("\n")
+  s.homepage = %q{http://github.com/brianmario/charlock_holmes}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.4.2}
+  s.summary = %q{Character encoding detection, brought to you by ICU}
+  s.test_files = `git ls-files spec`.split("\n")
+  # tests
+  s.add_development_dependency 'rake-compiler', ">= 0.7.5"
+  s.add_development_dependency 'rspec', ">= 2.0.0"
+  # benchmarks
+  s.add_development_dependency 'chardet'
+end

data/ext/charlock_holmes/common.h ADDED

@@ -0,0 +1,41 @@
+#ifndef CHARLOCK_COMMON_H
+#define CHARLOCK_COMMON_H
+// tell rbx not to use its caching compat layer
+// by doing this we're making a promise to RBX that
+// we'll never modify the pointers we get back from RSTRING_PTR
+#define RSTRING_NOT_MODIFIED
+#include <ruby.h>
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+#endif
+/*
+ * Build a Ruby String from the `len` bytes starting at `str`.
+ *
+ * With ruby/encoding.h available, `encoding` is cast to rb_encoding * and the
+ * string is created via rb_external_str_new_with_enc. Without it, `encoding`
+ * is ignored and a plain rb_str_new string is returned.
+ */
+static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
+{
+#ifndef HAVE_RUBY_ENCODING_H
+	return rb_str_new(str, len);
+#else
+	rb_encoding *enc = (rb_encoding *)encoding;
+	return rb_external_str_new_with_enc(str, len, enc);
+#endif
+}
+/*
+ * Build a Ruby String from the `len` bytes starting at `str`, using the UTF-8
+ * encoding when ruby/encoding.h is available and rb_str_new otherwise.
+ */
+static VALUE charlock_new_str(const char *str, size_t len)
+{
+#ifndef HAVE_RUBY_ENCODING_H
+	return rb_str_new(str, len);
+#else
+	rb_encoding *utf8 = rb_utf8_encoding();
+	return rb_external_str_new_with_enc(str, len, utf8);
+#endif
+}
+/*
+ * Build a Ruby String from the NUL-terminated C string `str`, using the UTF-8
+ * encoding when ruby/encoding.h is available and rb_str_new2 otherwise.
+ */
+static VALUE charlock_new_str2(const char *str)
+{
+#ifndef HAVE_RUBY_ENCODING_H
+	return rb_str_new2(str);
+#else
+	size_t len = strlen(str);
+	return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
+#endif
+}
+#endif

data/ext/charlock_holmes/converter.c ADDED

@@ -0,0 +1,53 @@
+#include "unicode/ucnv.h"
+#include "common.h"
+extern VALUE rb_mCharlockHolmes;
+static VALUE rb_cConverter;
+/*
+ * call-seq: converted = Converter.convert str, source_encoding, target_encoding
+ *
+ * Transcode a string from one encoding to another using ICU's ucnv_convert.
+ *
+ * rb_txt     - a String, the bytes to convert
+ * rb_src_enc - a String, the ICU converter name the input is encoded in
+ * rb_dst_enc - a String, the ICU converter name to convert to
+ *
+ * Returns: a new String holding the converted bytes; an empty input yields an
+ *          empty String
+ * Raises:  TypeError if an argument is not a String, ArgumentError if ICU
+ *          reports a conversion error, an encoding name contains a NUL byte,
+ *          or the input does not fit in an int32_t length
+ */
+static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
+	VALUE rb_out;
+	const char *src_enc;
+	const char *dst_enc;
+	const char *src_txt;
+	char *out_buf;
+	void *rb_enc = NULL;
+	int32_t src_len;
+	int32_t out_len;
+	UErrorCode status = U_ZERO_ERROR;
+	// reject non-String input before touching RSTRING_PTR/RSTRING_LEN
+	Check_Type(rb_txt, T_STRING);
+	// ICU takes the converter names as NUL-terminated C strings
+	src_enc = StringValueCStr(rb_src_enc);
+	dst_enc = StringValueCStr(rb_dst_enc);
+	// ucnv_convert takes an int32_t length; refuse to truncate silently
+	if (RSTRING_LEN(rb_txt) > INT32_MAX) {
+		rb_raise(rb_eArgError, "string is too large to convert");
+	}
+	src_txt = RSTRING_PTR(rb_txt);
+	src_len = (int32_t)RSTRING_LEN(rb_txt);
+	// first determine the size of the output buffer
+	// U_BUFFER_OVERFLOW_ERROR is the expected outcome of this sizing pass;
+	// warnings (e.g. for an empty result) are not failures
+	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
+		rb_raise(rb_eArgError, "%s", u_errorName(status));
+	}
+#ifdef HAVE_RUBY_ENCODING_H
+	// looked up before the buffer is allocated so nothing is held across it
+	rb_enc = (void *)rb_enc_find(dst_enc);
+#endif
+	// nothing to convert: return an empty string instead of allocating 0 bytes
+	if (out_len <= 0) {
+		return charlock_new_enc_str("", 0, rb_enc);
+	}
+	out_buf = malloc(out_len);
+	if (out_buf == NULL) {
+		rb_memerror();
+	}
+	// now do the actual conversion
+	status = U_ZERO_ERROR;
+	out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
+	if (U_FAILURE(status)) {
+		free(out_buf);
+		rb_raise(rb_eArgError, "%s", u_errorName(status));
+	}
+	rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
+	free(out_buf);
+	return rb_out;
+}
+/*
+ * Define CharlockHolmes::Converter and register its singleton method
+ * `convert`, which takes three arguments.
+ */
+void _init_charlock_converter() {
+	VALUE klass = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
+	rb_cConverter = klass;
+	rb_define_singleton_method(klass, "convert", rb_converter_convert, 3);
+}

data/ext/charlock_holmes/encoding_detector.c ADDED

@@ -0,0 +1,295 @@
+#include "unicode/ucsdet.h"
+#include "magic.h"
+#include "common.h"
+extern VALUE rb_mCharlockHolmes;
+static VALUE rb_cEncodingDetector;
+typedef struct { /* native state wrapped by each EncodingDetector object */
+	UCharsetDetector *csd; /* ICU charset detector, opened with ucsdet_open in rb_encdec__alloc */
+	magic_t magic; /* libmagic handle, opened with magic_open in rb_encdec__alloc */
+} charlock_detector_t;
+/*
+ * Convert an ICU charset match into a Ruby Hash.
+ *
+ * match - the UCharsetMatch to describe, may be NULL
+ *
+ * Returns: nil when match is NULL, otherwise a Hash with :type (always :text),
+ *          :encoding, :confidence and, when ICU reports a non-empty language,
+ *          :language
+ * Raises:  StandardError if ICU fails to describe the match
+ */
+static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	const char *mname;
+	const char *mlang;
+	int mconfidence;
+	VALUE rb_match;
+	if (!match)
+		return Qnil;
+	mname = ucsdet_getName(match, &status);
+	mlang = ucsdet_getLanguage(match, &status);
+	mconfidence = ucsdet_getConfidence(match, &status);
+	// a NULL name would be handed to strlen() by charlock_new_str2, so bail
+	// out with a Ruby exception instead of crashing the process
+	if (U_FAILURE(status))
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	if (!mname)
+		rb_raise(rb_eStandardError, "charset match has no encoding name");
+	rb_match = rb_hash_new();
+	rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
+	rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
+	rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
+	// language is optional: only reported when ICU returns a non-empty code
+	if (mlang && mlang[0])
+		rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
+	return rb_match;
+}
+/*
+ * Build the Hash reported for binary content:
+ * :type is :binary and :confidence is 100.
+ */
+static VALUE rb_encdec_binarymatch() {
+	VALUE result = rb_hash_new();
+	VALUE type_key = ID2SYM(rb_intern("type"));
+	VALUE type_val = ID2SYM(rb_intern("binary"));
+	VALUE conf_key = ID2SYM(rb_intern("confidence"));
+	rb_hash_aset(result, type_key, type_val);
+	rb_hash_aset(result, conf_key, INT2NUM(100));
+	return result;
+}
+/*
+ * Ask libmagic to describe the bytes of rb_str.
+ *
+ * Returns: 1 when the description does not contain "text", 0 when it does
+ * Raises:  StandardError with libmagic's error message when magic_buffer
+ *          returns NULL
+ */
+static int detect_binary_content(charlock_detector_t *detector, VALUE rb_str) {
+	const char *description;
+	description = magic_buffer(detector->magic, RSTRING_PTR(rb_str), RSTRING_LEN(rb_str));
+	if (description == NULL) {
+		rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
+	}
+	if (strstr(description, "text") == NULL)
+		return 1;
+	return 0;
+}
+/*
+ * call-seq: detection_hash = EncodingDetector#detect str[, hint_enc]
+ *
+ * Try to work out what encoding a string is in.
+ *
+ * str      - a String, the content to inspect
+ * hint_enc - an optional String (like "UTF-8"), passed to the charset
+ *            detector as the declared encoding
+ *
+ * Returns: the binary-match Hash (:type, :confidence) when libmagic does not
+ *          consider the content text; otherwise the Hash built from ICU's
+ *          best match (:encoding, :language, :type, :confidence), or nil when
+ *          ICU returns no match
+ */
+static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
+{
+	charlock_detector_t *detector;
+	const UCharsetMatch *best;
+	UErrorCode status = U_ZERO_ERROR;
+	VALUE rb_str;
+	VALUE rb_enc_hint;
+	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
+	Check_Type(rb_str, T_STRING);
+	Data_Get_Struct(self, charlock_detector_t, detector);
+	// binary content short-circuits: no charset detection is attempted
+	if (detect_binary_content(detector, rb_str)) {
+		return rb_encdec_binarymatch();
+	}
+	// looks like text, so hand the bytes (and the optional hint) to ICU
+	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
+	if (!NIL_P(rb_enc_hint)) {
+		Check_Type(rb_enc_hint, T_STRING);
+		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
+	}
+	best = ucsdet_detect(detector->csd, &status);
+	return rb_encdec_buildmatch(best);
+}
+/*
+ * call-seq: detection_hash_array = EncodingDetector#detect_all str[, hint_enc]
+ *
+ * Attempt to detect the encoding of this string, and return
+ * a list with all the possible encodings that match it.
+ *
+ * str      - a String, what you want to detect the encoding of
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
+ *            be used as an additional hint to the charset detector
+ *
+ * Returns: an Array with zero or more Hashes, each one of them with
+ *          :encoding, :language, :type and :confidence. When libmagic does
+ *          not consider the content text, a binary-match Hash (:type,
+ *          :confidence) is placed first.
+ */
+static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	charlock_detector_t *detector;
+	const UCharsetMatch **csm;
+	VALUE rb_ret;
+	int32_t i;
+	// ucsdet_detectAll leaves the count untouched when it fails, so start at 0
+	int32_t match_count = 0;
+	VALUE rb_str;
+	VALUE rb_enc_hint;
+	VALUE binary_match;
+	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
+	Check_Type(rb_str, T_STRING);
+	Data_Get_Struct(self, charlock_detector_t, detector);
+	rb_ret = rb_ary_new();
+	// first lets see if this is binary content
+	binary_match = Qnil;
+	if (detect_binary_content(detector, rb_str)) {
+		binary_match = rb_encdec_binarymatch();
+	}
+	// unlike detect, text detection still runs for binary-looking content
+	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
+	if (!NIL_P(rb_enc_hint)) {
+		Check_Type(rb_enc_hint, T_STRING);
+		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), (int32_t)RSTRING_LEN(rb_enc_hint), &status);
+	}
+	csm = ucsdet_detectAll(detector->csd, &match_count, &status);
+	// only walk the result when ICU actually produced one
+	if (csm != NULL && U_SUCCESS(status)) {
+		for (i = 0; i < match_count; ++i) {
+			rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
+		}
+	}
+	if (!NIL_P(binary_match))
+		rb_ary_unshift(rb_ret, binary_match);
+	return rb_ret;
+}
+/*
+ * call-seq: EncodingDetector#strip_tags
+ *
+ * Reports whether ICU's input filter is enabled on this detector.
+ *
+ * Returns: true when ucsdet_isInputFilterEnabled reports 1, false otherwise
+ */
+static VALUE rb_get_strip_tags(VALUE self)
+{
+	charlock_detector_t *detector;
+	Data_Get_Struct(self, charlock_detector_t, detector);
+	if (ucsdet_isInputFilterEnabled(detector->csd) == 1)
+		return Qtrue;
+	return Qfalse;
+}
+/*
+ * call-seq: EncodingDetector#strip_tags = true
+ *
+ * Turn ICU's input filter (stripping of HTML/XML tags before detection) on
+ * or off. Only the literal `true` enables it; every other value disables it.
+ *
+ * Returns: the value passed
+ */
+static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
+{
+	charlock_detector_t *detector;
+	UBool enabled = 0;
+	Data_Get_Struct(self, charlock_detector_t, detector);
+	if (rb_val == Qtrue)
+		enabled = 1;
+	ucsdet_enableInputFilter(detector->csd, enabled);
+	return rb_val;
+}
+/*
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
+ *
+ * The list of detectable encodings supported by this library. The list is
+ * built on the first call and cached on the class in the "encoding_list"
+ * instance variable.
+ *
+ * Returns: an Array of Strings
+ * Raises:  StandardError if ICU fails while opening a detector or
+ *          enumerating the charsets
+ */
+static VALUE rb_get_supported_encodings(VALUE klass)
+{
+	UCharsetDetector *csd;
+	UErrorCode status = U_ZERO_ERROR;
+	UEnumeration *encoding_list;
+	VALUE rb_encoding_list;
+	int32_t enc_count;
+	int32_t i;
+	const char *enc_name;
+	int32_t enc_name_len;
+	rb_encoding_list = rb_iv_get(klass, "encoding_list");
+	// already populated by an earlier call
+	if (!NIL_P(rb_encoding_list))
+		return rb_encoding_list;
+	// lazily populate the list
+	csd = ucsdet_open(&status);
+	if (U_FAILURE(status)) {
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	}
+	encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
+	if (U_FAILURE(status)) {
+		ucsdet_close(csd);
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	}
+	rb_encoding_list = rb_ary_new();
+	enc_count = uenum_count(encoding_list, &status);
+	for (i = 0; i < enc_count; i++) {
+		enc_name = uenum_next(encoding_list, &enc_name_len, &status);
+		if (U_FAILURE(status) || enc_name == NULL)
+			break;
+		rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
+	}
+	// the enumeration is owned by the caller and must be released as well
+	uenum_close(encoding_list);
+	ucsdet_close(csd);
+	// never cache a list that ICU failed to enumerate completely
+	if (U_FAILURE(status)) {
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	}
+	rb_iv_set(klass, "encoding_list", rb_encoding_list);
+	return rb_encoding_list;
+}
+/*
+ * Free callback registered through Data_Wrap_Struct: closes whichever of the
+ * ICU detector and libmagic handle were opened, then frees the struct itself.
+ */
+static void rb_encdec__free(void *obj)
+{
+	charlock_detector_t *detector = (charlock_detector_t *)obj;
+	if (detector->csd != NULL) {
+		ucsdet_close(detector->csd);
+	}
+	if (detector->magic != NULL) {
+		magic_close(detector->magic);
+	}
+	free(detector);
+}
+/*
+ * Allocator for EncodingDetector: wraps a zeroed charlock_detector_t and
+ * opens the ICU charset detector and the libmagic handle it carries.
+ *
+ * Returns: the new, wrapped object
+ * Raises:  NoMemoryError if the struct cannot be allocated, StandardError if
+ *          ICU or libmagic fail to open
+ */
+static VALUE rb_encdec__alloc(VALUE klass)
+{
+	charlock_detector_t *detector;
+	UErrorCode status = U_ZERO_ERROR;
+	VALUE obj;
+	detector = calloc(1, sizeof(charlock_detector_t));
+	// without this check a failed allocation is dereferenced just below
+	if (detector == NULL) {
+		rb_memerror();
+	}
+	// wrap first: if anything below raises, rb_encdec__free still runs on GC
+	// and the zeroed fields tell it which handles were never opened
+	obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
+	detector->csd = ucsdet_open(&status);
+	if (U_FAILURE(status)) {
+		rb_raise(rb_eStandardError, "%s", u_errorName(status));
+	}
+	detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
+	if (detector->magic == NULL) {
+		rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
+	}
+	return obj;
+}
+/*
+ * Define CharlockHolmes::EncodingDetector: its allocator, the instance
+ * methods detect, detect_all, strip_tags and strip_tags=, and the singleton
+ * method supported_encodings.
+ */
+void _init_charlock_encoding_detector()
+{
+	VALUE klass = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
+	rb_cEncodingDetector = klass;
+	rb_define_alloc_func(klass, rb_encdec__alloc);
+	/* instance methods; -1 means the C function receives argc/argv */
+	rb_define_method(klass, "detect", rb_encdec_detect, -1);
+	rb_define_method(klass, "detect_all", rb_encdec_detect_all, -1);
+	rb_define_method(klass, "strip_tags", rb_get_strip_tags, 0);
+	rb_define_method(klass, "strip_tags=", rb_set_strip_tags, 1);
+	/* class-level method */
+	rb_define_singleton_method(klass, "supported_encodings", rb_get_supported_encodings, 0);
+}

data/ext/charlock_holmes/ext.c ADDED

@@ -0,0 +1,13 @@
+#include "common.h"
+extern void _init_charlock_encoding_detector();
+extern void _init_charlock_converter();
+VALUE rb_mCharlockHolmes;
+/*
+ * Extension entry point: defines the CharlockHolmes module, then registers
+ * the EncodingDetector and Converter classes beneath it.
+ */
+void Init_charlock_holmes() {
+	VALUE mod = rb_define_module("CharlockHolmes");
+	rb_mCharlockHolmes = mod;
+	_init_charlock_encoding_detector();
+	_init_charlock_converter();
+}

data/ext/charlock_holmes/extconf.rb ADDED

@@ -0,0 +1,86 @@
+require 'mkmf'
+CWD = File.expand_path(File.dirname(__FILE__))
+def sys(cmd)
+  puts "  -- #{cmd}"
+  unless ret = xsystem(cmd)
+    raise "#{cmd} failed, please report issue on http://github.com/brianmario/charlock_holmes"
+  end
+  ret
+end
+if `which make`.strip.empty?
+  STDERR.puts "\n\n"
+  STDERR.puts "***************************************************************************************"
+  STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************"
+  STDERR.puts "***************************************************************************************"
+  exit(1)
+end
+##
+# ICU dependency
+#
+# Unpacks the bundled ICU tarball under src/, builds static PIC libraries into
+# dst/ and verifies that the extension can link against them.
+src = 'icu4c-52_1-src.tgz'
+dir = 'icu'
+Dir.chdir("#{CWD}/src") do
+  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
+  FileUtils.rm_rf(dir) if File.exist?(dir)
+  sys("tar zxvf #{src}")
+  Dir.chdir(File.join(dir, 'source')) do
+    sys("LDFLAGS= CXXFLAGS=\"-O2 -fPIC\" CFLAGS=\"-O2 -fPIC\" ./configure --prefix=#{CWD}/dst/ --disable-tests --disable-samples --disable-icuio --disable-extras --disable-layout --enable-static --disable-shared")
+    sys("make install")
+  end
+end
+dir_config 'icu'
+$INCFLAGS << " -I#{CWD}/dst/include "
+$LDFLAGS  << " -L#{CWD}/dst/lib"
+# probe the libraries in this order and stop at the first one that is missing
+icu_libs = %w[icui18n icudata icutu icuuc]
+unless icu_libs.all? { |lib| have_library(lib) } && have_header('unicode/ucnv.h')
+  STDERR.puts "\n\n"
+  STDERR.puts "***************************************************************************************"
+  STDERR.puts "********* error compiling and linking icu4c. please report issue on github *********"
+  STDERR.puts "***************************************************************************************"
+  exit(1)
+end
+##
+# libmagic dependency
+#
+# Unpacks and patches the libmagic sources under src/, builds a static PIC
+# library into dst/, copies it next to the extension as libmagic_ext.a and
+# verifies that the extension can link against it.
+# NOTE(review): this expects file-5.08.tar.gz and file-soft-check.patch to be
+# present in src/ -- confirm both are shipped with the gem.
+src = 'file-5.08.tar.gz'
+dir = File.basename(src, '.tar.gz')
+Dir.chdir("#{CWD}/src") do
+  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
+  FileUtils.rm_rf(dir) if File.exist?(dir)
+  sys("tar zxvf #{src}")
+  Dir.chdir(dir) do
+    sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
+    sys("patch -p0 < ../file-soft-check.patch")
+    sys("make -C src install")
+    sys("make -C magic install")
+  end
+end
+FileUtils.cp "#{CWD}/dst/lib/libmagic.a", "#{CWD}/libmagic_ext.a"
+# prepend so the bundled headers come first on the include path
+$INCFLAGS[0,0] = " -I#{CWD}/dst/include "
+$LDFLAGS << " -L#{CWD} "
+dir_config 'magic'
+unless have_library('magic_ext') && have_header('magic.h')
+  STDERR.puts "\n\n"
+  STDERR.puts "***************************************************************************************"
+  STDERR.puts "********* error compiling and linking libmagic. please report issue on github *********"
+  STDERR.puts "***************************************************************************************"
+  exit(1)
+end
+$CFLAGS << ' -Wall -funroll-loops'
+# extra warnings and an unoptimized debug build when DEBUG is set
+$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
+$LIBS << " -lstdc++"
+create_makefile 'charlock_holmes/charlock_holmes'