charlock_holmes-jruby 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ -Ilib
2
+ --color
3
+ --format doc
4
+ spec
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
@@ -0,0 +1,29 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ charlock_holmes-jruby (0.1.0-java)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.2.1)
10
+ rake (10.0.4)
11
+ rake-compiler (0.8.3)
12
+ rake
13
+ rspec (2.13.0)
14
+ rspec-core (~> 2.13.0)
15
+ rspec-expectations (~> 2.13.0)
16
+ rspec-mocks (~> 2.13.0)
17
+ rspec-core (2.13.1)
18
+ rspec-expectations (2.13.0)
19
+ diff-lcs (>= 1.1.3, < 2.0)
20
+ rspec-mocks (2.13.0)
21
+
22
+ PLATFORMS
23
+ java
24
+
25
+ DEPENDENCIES
26
+ charlock_holmes-jruby!
27
+ rake
28
+ rake-compiler (>= 0.7.5)
29
+ rspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Francis Chong francis@ignition.hk
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # CharlockHolmes for JRuby
2
+
3
+ Character encoding detection library for JRuby, built on [ICU4J](http://site.icu-project.org/).
4
+
5
+ ## Requirements
6
+
7
+ - JRuby in 1.9 mode (or above)
8
+
9
+ ## Usage
10
+
11
+ First you'll need to require it
12
+
13
+ ```
14
+ require 'charlock_holmes'
15
+ ```
16
+
17
+ ## Encoding detection
18
+
19
+ ```ruby
20
+ contents = File.read('test.xml')
21
+ detection = CharlockHolmes::EncodingDetector.detect(contents)
22
+ # => {:encoding => 'UTF-8', :confidence => 100}
23
+
24
+ # optionally there will be a :language key as well, but
25
+ # that's mostly only returned for legacy encodings like ISO-8859-1
26
+ ```
27
+
28
+ NOTE: ```CharlockHolmes::EncodingDetector.detect``` will return nil if it was unable to find an encoding.
29
+
30
+ ## String monkey patch
31
+
32
+ Alternatively, you can just use the detect_encoding method on the String class
33
+
34
+ ```
35
+ require 'charlock_holmes/string'
36
+
37
+ contents = File.read('test.xml')
38
+
39
+ detection = contents.detect_encoding
40
+ # => {:encoding => 'UTF-8', :confidence => 100}
41
+
42
+ # this will detect and set the encoding of `contents`, then return self
43
+ contents.detect_encoding!
44
+ ```
45
+
46
+ ## Transcoding
47
+
48
+ Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.
49
+
50
+ ```
51
+
52
+ content = File.read('test2.txt')
53
+ detection = CharlockHolmes::EncodingDetector.detect(content)
54
+ utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
55
+ ```
56
+
57
+ The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.
58
+
59
+ ## Installing
60
+
61
+ ```
62
+ gem install charlock_holmes-jruby
63
+ ```
64
+
65
+
@@ -0,0 +1,17 @@
1
+ $LOAD_PATH << 'lib'
2
+
3
+ require 'rake/testtask'
4
+ require 'charlock_holmes/version'
5
+
6
# Builds the gem in the current directory from charlock_holmes-jruby.gemspec.
task :package do
  system %(gem build charlock_holmes-jruby.gemspec)
end

# Tags the current version in git (unless that tag already exists), pushes the
# built gem, then moves the gem file into pkg/.
task :release => :package do
  version_string = "v#{CharlockHolmes::VERSION}"

  # Compare against whole tag names: a substring test on the raw output would
  # treat "v0.1.0" as already tagged when only e.g. "v0.1.0.1" exists.
  existing_tags = %x(git tag -l).split("\n").map(&:strip)
  unless existing_tags.include?(version_string)
    system %(git tag -a #{version_string} -m #{version_string})
  end

  # Create pkg/ first; without it `mv` would rename the gem to a file called "pkg".
  system %(gem push charlock_holmes-jruby-*.gem && mkdir -p pkg && mv charlock_holmes-jruby-*.gem pkg)
end
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
+ require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
# Gem metadata for the JRuby (java platform) build of charlock_holmes.
# File lists come from `git ls-files`, so the gem must be built from a git checkout.
Gem::Specification.new do |s|
  s.name = %q{charlock_holmes-jruby}
  s.version = CharlockHolmes::VERSION
  s.platform = 'java'
  s.authors = ["Francis Chong"]
  s.date = Time.now.utc.strftime("%Y-%m-%d")
  s.email = %q{francis@ignition.hk}
  # The bundled LICENSE file carries the MIT licence text.
  s.license = 'MIT'

  s.files = `git ls-files`.split("\n")
  s.homepage = %q{http://github.com/siuying/charlock_holmes-jruby}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.4.2}
  s.summary = %q{Character encoding detection, brought to you by ICU.}
  s.description = %q{JRuby compatible ICU encoding detection built on top of ICU4J.}
  s.test_files = `git ls-files spec`.split("\n")

  s.add_development_dependency 'rake-compiler', ">= 0.7.5"
  s.add_development_dependency 'rspec'
  s.add_development_dependency 'rake'
end
@@ -0,0 +1 @@
1
+ require 'charlock_holmes'
@@ -0,0 +1,5 @@
1
+ require 'charlock_holmes/version'
2
+ require CharlockHolmes::ICU_JAR_FILENAME
3
+
4
+ require 'charlock_holmes/encoding_detector'
5
+ require 'charlock_holmes/converter'
@@ -0,0 +1,17 @@
1
+ require_relative 'version'
2
+ require_relative "../#{CharlockHolmes::ICU_JAR_FILENAME}"
3
+ java_import 'com.ibm.icu.text.CharsetMatch'
4
+
5
module CharlockHolmes
  # Mixin that renders a charset match as a plain Ruby hash. The including
  # object must respond to getName, getConfidence and getLanguage.
  module CharsetMatchExt
    # Returns a hash with the keys :encoding, :confidence and :language,
    # filled from the corresponding getters of the receiver.
    def to_hash
      summary = {}
      summary[:encoding] = getName
      summary[:confidence] = getConfidence
      summary[:language] = getLanguage
      summary
    end
  end
end
16
+
17
+ com::ibm::icu::text::CharsetMatch.send :include, CharlockHolmes::CharsetMatchExt
@@ -0,0 +1,12 @@
1
module CharlockHolmes
  # Transcoding helper; callable directly on the module
  # (CharlockHolmes::Converter.convert).
  module Converter
    extend self

    # Transcodes +string+ from the +from+ encoding into the +to+ encoding
    # using String#encode and returns the new string.
    #
    # Raises TypeError when any of the three arguments is nil (or false).
    # Arguments are checked in order: string, from, to.
    def convert(string, from, to)
      { 'string' => string, 'from' => from, 'to' => to }.each do |label, value|
        raise TypeError, "#{label} cannot be nil" unless value
      end

      string.encode(to, from)
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require_relative 'version'
2
+ require_relative "../#{CharlockHolmes::ICU_JAR_FILENAME}"
3
+ require_relative 'charset_match_ext'
4
+
5
+ java_import 'com.ibm.icu.text.CharsetDetector'
6
+ java_import 'java.io.ByteArrayInputStream'
7
+
8
module CharlockHolmes
  # Encoding detection helpers built on CharsetDetector; callable directly on
  # the module (CharlockHolmes::EncodingDetector.detect).
  module EncodingDetector
    extend self

    # Returns the detector's match for +string+ as a hash (via #to_hash), or
    # nil when there is no match or the detection call raises.
    # +hint+ is passed to the detector as the declared encoding.
    def detect(string, hint = nil)
      detector = create_detector(string, hint)

      begin
        best = detector.detect
        best.to_hash if best
      rescue
        nil
      end
    end

    # Returns every candidate match for +string+ as an array of hashes, or nil
    # when the detector returns nothing or the detection call raises.
    def detect_all(string, hint = nil)
      detector = create_detector(string, hint)

      begin
        candidates = detector.detectAll
        candidates.map { |candidate| candidate.to_hash } if candidates
      rescue
        nil
      end
    end

    # Returns CharsetDetector.getAllDetectableCharsets converted to a Ruby array.
    def all_detectable_charsets
      CharsetDetector.getAllDetectableCharsets.to_a
    end

    private

    # Builds a CharsetDetector fed with the bytes of +string+ and the declared
    # encoding +hint+. Errors raised here are not rescued by the callers.
    def create_detector(string, hint = nil)
      detector = CharsetDetector.new
      input = ByteArrayInputStream.new(string.to_java_bytes)
      detector.tap do |d|
        d.setText(input)
        d.setDeclaredEncoding(hint)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require_relative 'encoding_detector'
2
+
3
module CharlockHolmes
  # String mixin that delegates encoding detection to EncodingDetector,
  # using the receiver as the content to inspect.
  module StringExt
    # Returns EncodingDetector.detect for this string: a hash describing the
    # match, or nil when nothing was detected. +hint+ is forwarded unchanged.
    def detect_encoding(hint=nil)
      EncodingDetector.detect(self, hint)
    end

    # Returns EncodingDetector.detect_all for this string (array of match
    # hashes, or nil).
    def detect_encodings(hint=nil)
      EncodingDetector.detect_all(self, hint)
    end

    # Detects the encoding and, when one is found, re-tags this string with it
    # via force_encoding (the bytes are not transcoded). Always returns self;
    # the string is left untouched when detection yields nil.
    def detect_encoding!(hint=nil)
      detected = EncodingDetector.detect(self, hint)
      # detect returns nil when it cannot find an encoding, so guard before indexing.
      force_encoding(detected[:encoding]) if detected && detected[:encoding]
      self
    end
  end
end
20
+
21
+ String.send :include, CharlockHolmes::StringExt
@@ -0,0 +1,5 @@
1
module CharlockHolmes
  # Version of this gem.
  VERSION = '0.1.0'

  # ICU version label used to name the ICU4J jar.
  ICU_VERSION = '51.1'

  # Jar file name required by the library; the version's dot becomes an
  # underscore ("51.1" -> "icu4j-51_1.jar").
  ICU_JAR_FILENAME = "icu4j-#{ICU_VERSION.tr('.', '_')}.jar"
end
@@ -0,0 +1 @@
1
+ require 'charlock_holmes'
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helpers'
4
+ require 'charlock_holmes'
5
+
6
# Exercises CharlockHolmes::Converter.convert: round-trip transcoding and
# rejection of nil arguments.
describe CharlockHolmes::Converter do
  subject { CharlockHolmes::Converter }

  describe "#convert" do
    it "should convert ascii from iso8859-1 to utf-16 and back" do
      input = 'test'

      output = subject.convert input, 'ISO-8859-1', 'UTF-16'
      output.encoding.to_s.should == "UTF-16"
      input.bytesize.should < output.bytesize
      input.should_not == output

      output = subject.convert output, 'UTF-16', 'ISO-8859-1'
      output.encoding.to_s.should == "ISO-8859-1"
      input.bytesize.should == output.bytesize
      input.should == output
    end

    it "should convert utf8 to utf16 and back" do
      input = 'λ, λ, λ'

      output = subject.convert input, 'UTF-8', 'UTF-16'
      output.encoding.to_s.should == "UTF-16"
      input.bytesize.should < output.bytesize
      input.should_not == output

      output = subject.convert output, 'UTF-16', 'UTF-8'
      output.encoding.to_s.should == "UTF-8"
      input.bytesize.should == output.bytesize
      input.should == output
    end

    it "should raise error if params are not string" do
      expect {
        subject.convert nil, 'UTF-8', 'UTF-16'
      }.to raise_error(TypeError)

      expect {
        subject.convert 'lol', 'UTF-8', nil
      }.to raise_error(TypeError)

      expect {
        subject.convert 'lol', nil, 'UTF-16'
      }.to raise_error(TypeError)

      # Negative expectation without an error class: naming a class here could
      # let a different exception slip through, and newer RSpec rejects it.
      expect {
        subject.convert 'lol', 'UTF-8', 'UTF-8'
      }.to_not raise_error
    end
  end

end
@@ -0,0 +1,97 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helpers'
4
+ require 'charlock_holmes'
5
+
6
# Exercises CharlockHolmes::EncodingDetector against literal strings and the
# fixture files under spec/fixtures.
describe CharlockHolmes::EncodingDetector do
  subject { CharlockHolmes::EncodingDetector }

  describe "#detect" do
    it "should detect encoding of string" do
      detected = subject.detect 'hello'
      detected[:encoding].should == 'ISO-8859-1'
    end

    it "should accept encoding hint" do
      detected = subject.detect 'hello', 'UTF-8'
      detected[:encoding].should == 'ISO-8859-1'
    end

    # Fixture file name => encoding the detector is expected to report.
    # Kept as a local: a constant assigned inside this block would be defined
    # at the top level and leak into every other spec file.
    fixture_encodings = [
      ['repl2.cljs', 'ISO-8859-1'],
      ['core.rkt', 'UTF-8'],
      ['cl-messagepack.lisp', 'ISO-8859-1'],
      ['TwigExtensionsDate.es.yml', 'UTF-8'],
      ['AnsiGraph.psm1', 'UTF-16LE'],
      ['laholator.py', 'UTF-8'],
      ['mingpao.html', 'Big5'],
      ['shift_jis.html', 'Shift_JIS']
    ]

    fixture_encodings.each do |file, encoding|
      it "should detect encoding of test file #{file}" do
        path = File.expand_path "../fixtures/#{file}", __FILE__
        content = File.read path
        guessed = subject.detect content
        guessed[:encoding].should == encoding
        # The reported name must also be one Ruby accepts for these bytes.
        content.force_encoding guessed[:encoding]
        content.valid_encoding?.should be_true
      end
    end
  end

  describe "#detect_all" do
    it "should returns array of possible matches" do
      detected_list = subject.detect_all 'test'
      detected_list.should be_a(Array)

      encoding_list = detected_list.map {|d| d[:encoding] }.sort
      encoding_list.should == ['ISO-8859-1', 'ISO-8859-2', 'UTF-8']
    end

    it "should accept encoding hint" do
      detected_list = subject.detect_all 'test', 'UTF-8'
      detected_list.should be_a(Array)

      encoding_list = detected_list.map {|d| d[:encoding] }.sort
      encoding_list.should == ['ISO-8859-1', 'ISO-8859-2', 'UTF-8']
    end
  end

  describe "#all_detectable_charsets" do
    it "should return array of charsets" do
      list = subject.all_detectable_charsets
      list.should be_a(Array)
      # Order matters: the list is compared exactly as the detector returns it.
      list.should == %w{
        UTF-8
        UTF-16BE
        UTF-16LE
        UTF-32BE
        UTF-32LE
        Shift_JIS
        ISO-2022-JP
        ISO-2022-CN
        ISO-2022-KR
        GB18030
        EUC-JP
        EUC-KR
        Big5
        ISO-8859-1
        ISO-8859-2
        ISO-8859-5
        ISO-8859-6
        ISO-8859-7
        ISO-8859-8
        windows-1251
        windows-1256
        KOI8-R
        ISO-8859-9
        IBM424_rtl
        IBM424_ltr
        IBM420_rtl
        IBM420_ltr
      }
    end
  end
end