charguess 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsCharSetProber_h__
|
|
22
|
+
#define nsCharSetProber_h__
|
|
23
|
+
|
|
24
|
+
#include "types.h"
|
|
25
|
+
|
|
26
|
+
typedef enum {
|
|
27
|
+
eDetecting = 0, //We are still detecting; no definite answer yet, but the caller can ask for the confidence.
|
|
28
|
+
eFoundIt = 1, //That's a positive answer
|
|
29
|
+
eNotMe = 2 //negative answer
|
|
30
|
+
} nsProbingState;
|
|
31
|
+
|
|
32
|
+
#define SHORTCUT_THRESHOLD (float)0.95
|
|
33
|
+
|
|
34
|
+
class nsCharSetProber {
|
|
35
|
+
public:
|
|
36
|
+
virtual ~nsCharSetProber() {};
|
|
37
|
+
virtual const char* GetCharSetName() = 0;
|
|
38
|
+
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
|
39
|
+
virtual nsProbingState GetState(void) = 0;
|
|
40
|
+
virtual void Reset(void) = 0;
|
|
41
|
+
virtual float GetConfidence(void) = 0;
|
|
42
|
+
virtual void SetOpion() = 0;
|
|
43
|
+
|
|
44
|
+
#ifdef DEBUG_chardet
|
|
45
|
+
virtual void DumpStatus() {};
|
|
46
|
+
#endif
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
#endif /* nsCharSetProber_h__ */
|
|
50
|
+
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsCodingStateMachine_h__
|
|
22
|
+
#define nsCodingStateMachine_h__
|
|
23
|
+
|
|
24
|
+
#include "pkgInt.h"
|
|
25
|
+
|
|
26
|
+
typedef enum {
|
|
27
|
+
eStart = 0,
|
|
28
|
+
eError = 1,
|
|
29
|
+
eItsMe = 2
|
|
30
|
+
} nsSMState;
|
|
31
|
+
|
|
32
|
+
#define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable)
|
|
33
|
+
|
|
34
|
+
//state machine model
|
|
35
|
+
typedef struct
|
|
36
|
+
{
|
|
37
|
+
nsPkgInt classTable;
|
|
38
|
+
PRUint32 classFactor;
|
|
39
|
+
nsPkgInt stateTable;
|
|
40
|
+
const PRUint32* charLenTable;
|
|
41
|
+
const char* name;
|
|
42
|
+
} SMModel;
|
|
43
|
+
|
|
44
|
+
class nsCodingStateMachine {
|
|
45
|
+
public:
|
|
46
|
+
nsCodingStateMachine(SMModel* sm){
|
|
47
|
+
mCurrentState = eStart;
|
|
48
|
+
mModel = sm;
|
|
49
|
+
};
|
|
50
|
+
nsSMState NextState(char c){
|
|
51
|
+
//for each byte we get its class; if it is the first byte of a character, we also get the character's byte length
|
|
52
|
+
PRUint32 byteCls = GETCLASS(c);
|
|
53
|
+
if (mCurrentState == eStart)
|
|
54
|
+
{
|
|
55
|
+
mCurrentBytePos = 0;
|
|
56
|
+
mCurrentCharLen = mModel->charLenTable[byteCls];
|
|
57
|
+
}
|
|
58
|
+
//from the byte's class and the stateTable, we get the next state
|
|
59
|
+
mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls,
|
|
60
|
+
mModel->stateTable);
|
|
61
|
+
mCurrentBytePos++;
|
|
62
|
+
return mCurrentState;
|
|
63
|
+
};
|
|
64
|
+
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;};
|
|
65
|
+
void Reset(void) {mCurrentState = eStart;};
|
|
66
|
+
const char * GetCodingStateMachine() {return mModel->name;};
|
|
67
|
+
|
|
68
|
+
protected:
|
|
69
|
+
nsSMState mCurrentState;
|
|
70
|
+
PRUint32 mCurrentCharLen;
|
|
71
|
+
PRUint32 mCurrentBytePos;
|
|
72
|
+
|
|
73
|
+
SMModel *mModel;
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
extern SMModel UTF8SMModel;
|
|
77
|
+
extern SMModel Big5SMModel;
|
|
78
|
+
extern SMModel EUCJPSMModel;
|
|
79
|
+
extern SMModel EUCKRSMModel;
|
|
80
|
+
extern SMModel EUCTWSMModel;
|
|
81
|
+
extern SMModel GB18030SMModel;
|
|
82
|
+
extern SMModel SJISSMModel;
|
|
83
|
+
extern SMModel UCS2BESMModel;
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
extern SMModel HZSMModel;
|
|
87
|
+
extern SMModel ISO2022CNSMModel;
|
|
88
|
+
extern SMModel ISO2022JPSMModel;
|
|
89
|
+
extern SMModel ISO2022KRSMModel;
|
|
90
|
+
|
|
91
|
+
#endif /* nsCodingStateMachine_h__ */
|
|
92
|
+
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/* config.h. Generated by configure. */
|
|
2
|
+
/* config.h.in. Generated from configure.in by autoheader. */
|
|
3
|
+
|
|
4
|
+
/* Name of package */
|
|
5
|
+
#define PACKAGE "libcharguess"
|
|
6
|
+
|
|
7
|
+
/* Define to the address where bug reports for this package should be sent. */
|
|
8
|
+
#define PACKAGE_BUGREPORT ""
|
|
9
|
+
|
|
10
|
+
/* Define to the full name of this package. */
|
|
11
|
+
#define PACKAGE_NAME ""
|
|
12
|
+
|
|
13
|
+
/* Define which directory should be used for package source. */
|
|
14
|
+
#define PACKAGE_SOURCE_DIR "/Users/hydrus/charguess/ext/libcharguess/cpp"
|
|
15
|
+
|
|
16
|
+
/* Define to the full name and version of this package. */
|
|
17
|
+
#define PACKAGE_STRING ""
|
|
18
|
+
|
|
19
|
+
/* Define to the one symbol short name of this package. */
|
|
20
|
+
#define PACKAGE_TARNAME ""
|
|
21
|
+
|
|
22
|
+
/* Define to the version of this package. */
|
|
23
|
+
#define PACKAGE_VERSION ""
|
|
24
|
+
|
|
25
|
+
/* Define to 1 if you have the ANSI C header files. */
|
|
26
|
+
#define STDC_HEADERS 1
|
|
27
|
+
|
|
28
|
+
/* Version number of package */
|
|
29
|
+
#define VERSION "1.0b"
|
|
30
|
+
|
|
31
|
+
/* Define to empty if `const' does not conform to ANSI C. */
|
|
32
|
+
/* #undef const */
|
|
33
|
+
|
|
34
|
+
/* Define as `__inline' if that's what the C compiler calls it, or to nothing
|
|
35
|
+
if it is not supported. */
|
|
36
|
+
/* #undef inline */
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/* config.h.in. Generated from configure.in by autoheader. */
|
|
2
|
+
|
|
3
|
+
/* Name of package */
|
|
4
|
+
#undef PACKAGE
|
|
5
|
+
|
|
6
|
+
/* Define to the address where bug reports for this package should be sent. */
|
|
7
|
+
#undef PACKAGE_BUGREPORT
|
|
8
|
+
|
|
9
|
+
/* Define to the full name of this package. */
|
|
10
|
+
#undef PACKAGE_NAME
|
|
11
|
+
|
|
12
|
+
/* Define which directory should be used for package source. */
|
|
13
|
+
#undef PACKAGE_SOURCE_DIR
|
|
14
|
+
|
|
15
|
+
/* Define to the full name and version of this package. */
|
|
16
|
+
#undef PACKAGE_STRING
|
|
17
|
+
|
|
18
|
+
/* Define to the one symbol short name of this package. */
|
|
19
|
+
#undef PACKAGE_TARNAME
|
|
20
|
+
|
|
21
|
+
/* Define to the version of this package. */
|
|
22
|
+
#undef PACKAGE_VERSION
|
|
23
|
+
|
|
24
|
+
/* Define to 1 if you have the ANSI C header files. */
|
|
25
|
+
#undef STDC_HEADERS
|
|
26
|
+
|
|
27
|
+
/* Version number of package */
|
|
28
|
+
#undef VERSION
|
|
29
|
+
|
|
30
|
+
/* Define to empty if `const' does not conform to ANSI C. */
|
|
31
|
+
#undef const
|
|
32
|
+
|
|
33
|
+
/* Define as `__inline' if that's what the C compiler calls it, or to nothing
|
|
34
|
+
if it is not supported. */
|
|
35
|
+
#undef inline
|