charguess 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,50 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsCharSetProber_h__
22
+ #define nsCharSetProber_h__
23
+
24
+ #include "types.h"
25
+
26
+ typedef enum { // probing result type returned by nsCharSetProber::HandleData() and GetState()
27
+ eDetecting = 0, // still detecting: no definite answer yet, but the caller can ask for a confidence value
28
+ eFoundIt = 1, // positive answer
29
+ eNotMe = 2 // negative answer
30
+ } nsProbingState;
31
+
32
+ #define SHORTCUT_THRESHOLD (float)0.95 /* NOTE(review): expands to a bare cast expression with no outer parentheses; presumably a confidence cutoff - confirm against the callers */
33
+
34
+ class nsCharSetProber { // abstract prober interface: everything except the destructor and DumpStatus() is pure virtual
35
+ public:
36
+ virtual ~nsCharSetProber() {}; // virtual so that derived probers can be deleted through a base-class pointer
37
+ virtual const char* GetCharSetName() = 0; // NOTE(review): presumably the name of the charset this prober detects - confirm in the implementations
38
+ virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; // takes a buffer pointer and a length, reports a probing state
39
+ virtual nsProbingState GetState(void) = 0; // reports a probing state without taking new data
40
+ virtual void Reset(void) = 0; // NOTE(review): presumably returns the prober to its initial state - confirm
41
+ virtual float GetConfidence(void) = 0; // confidence as a float; the valid range is not visible in this header
42
+ virtual void SetOpion() = 0; // NOTE(review): name looks like a misspelling of "SetOption"; renaming it would break every implementer
43
+
44
+ #ifdef DEBUG_chardet
45
+ virtual void DumpStatus() {}; // debug-only hook with an empty default body; exists only when DEBUG_chardet is defined
46
+ #endif
47
+ };
48
+
49
+ #endif /* nsCharSetProber_h__ */
50
+
@@ -0,0 +1,92 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsCodingStateMachine_h__
22
+ #define nsCodingStateMachine_h__
23
+
24
+ #include "pkgInt.h"
25
+
26
+ typedef enum { // state type returned by nsCodingStateMachine::NextState()
27
+ eStart = 0, // initial state; NextState() starts a new character whenever it is called in this state
28
+ eError = 1, // NOTE(review): presumably an illegal byte sequence for the model - confirm
29
+ eItsMe = 2 // NOTE(review): presumably a sequence that positively identifies the model - confirm
30
+ } nsSMState;
31
+
32
+ #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) /* looks up c in mModel->classTable after converting it to unsigned char; requires a variable named mModel to be in scope where the macro is expanded */
33
+
34
+ // State machine model: the tables and name consumed by nsCodingStateMachine.
35
+ typedef struct
36
+ {
37
+ nsPkgInt classTable; // packed table read by GETCLASS() to map a byte value to its class
38
+ PRUint32 classFactor; // multiplier applied to the current state when indexing stateTable
39
+ nsPkgInt stateTable; // packed table read at (state * classFactor + byte class) to get the next state
40
+ const PRUint32* charLenTable; // indexed by byte class; read when a new character starts to get its length
41
+ const char* name; // returned by nsCodingStateMachine::GetCodingStateMachine()
42
+ } SMModel;
43
+
44
+ class nsCodingStateMachine { // drives one SMModel over a byte stream, one byte per NextState() call
45
+ public:
46
+ nsCodingStateMachine(SMModel* sm){ // stores the pointer only; the model is not copied
47
+ mCurrentState = eStart;
48
+ mModel = sm;
49
+ }; // NOTE(review): mCurrentCharLen and mCurrentBytePos stay uninitialized until the first NextState() call
50
+ nsSMState NextState(char c){ // consumes one byte and returns the state the machine moved to
51
+ // For each byte we look up its class; if it is the first byte of a character, we also look up the character's byte length.
52
+ PRUint32 byteCls = GETCLASS(c);
53
+ if (mCurrentState == eStart)
54
+ {
55
+ mCurrentBytePos = 0;
56
+ mCurrentCharLen = mModel->charLenTable[byteCls];
57
+ }
58
+ // The next state is read from stateTable at index (current state * classFactor + byte class).
59
+ mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls,
60
+ mModel->stateTable);
61
+ mCurrentBytePos++;
62
+ return mCurrentState;
63
+ };
64
+ PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}; // value is indeterminate if called before the first NextState()
65
+ void Reset(void) {mCurrentState = eStart;}; // resets only the state; the other counters are reassigned by the next NextState() call
66
+ const char * GetCodingStateMachine() {return mModel->name;}; // returns the model's name string
67
+
68
+ protected:
69
+ nsSMState mCurrentState; // state after the most recently consumed byte (eStart initially and after Reset())
70
+ PRUint32 mCurrentCharLen; // length read from charLenTable for the character currently being consumed
71
+ PRUint32 mCurrentBytePos; // bytes consumed since the machine was last in eStart
72
+
73
+ SMModel *mModel; // model supplied to the constructor; stored as a raw pointer and never deleted in this class
74
+ };
75
+
76
+ extern SMModel UTF8SMModel;
77
+ extern SMModel Big5SMModel;
78
+ extern SMModel EUCJPSMModel;
79
+ extern SMModel EUCKRSMModel;
80
+ extern SMModel EUCTWSMModel;
81
+ extern SMModel GB18030SMModel;
82
+ extern SMModel SJISSMModel;
83
+ extern SMModel UCS2BESMModel;
84
+
85
+
86
+ extern SMModel HZSMModel;
87
+ extern SMModel ISO2022CNSMModel;
88
+ extern SMModel ISO2022JPSMModel;
89
+ extern SMModel ISO2022KRSMModel;
90
+
91
+ #endif /* nsCodingStateMachine_h__ */
92
+
@@ -0,0 +1,36 @@
1
+ /* config.h. Generated by configure. */
2
+ /* config.h.in. Generated from configure.in by autoheader. */
3
+
4
+ /* Name of package */
5
+ #define PACKAGE "libcharguess"
6
+
7
+ /* Define to the address where bug reports for this package should be sent. */
8
+ #define PACKAGE_BUGREPORT ""
9
+
10
+ /* Define to the full name of this package. */
11
+ #define PACKAGE_NAME ""
12
+
13
+ /* Define which directory should be used for package source. */
14
+ #define PACKAGE_SOURCE_DIR "/Users/hydrus/charguess/ext/libcharguess/cpp"
15
+
16
+ /* Define to the full name and version of this package. */
17
+ #define PACKAGE_STRING ""
18
+
19
+ /* Define to the one symbol short name of this package. */
20
+ #define PACKAGE_TARNAME ""
21
+
22
+ /* Define to the version of this package. */
23
+ #define PACKAGE_VERSION ""
24
+
25
+ /* Define to 1 if you have the ANSI C header files. */
26
+ #define STDC_HEADERS 1
27
+
28
+ /* Version number of package */
29
+ #define VERSION "1.0b"
30
+
31
+ /* Define to empty if `const' does not conform to ANSI C. */
32
+ /* #undef const */
33
+
34
+ /* Define as `__inline' if that's what the C compiler calls it, or to nothing
35
+ if it is not supported. */
36
+ /* #undef inline */
@@ -0,0 +1,35 @@
1
+ /* config.h.in. Generated from configure.in by autoheader. */
2
+
3
+ /* Name of package */
4
+ #undef PACKAGE
5
+
6
+ /* Define to the address where bug reports for this package should be sent. */
7
+ #undef PACKAGE_BUGREPORT
8
+
9
+ /* Define to the full name of this package. */
10
+ #undef PACKAGE_NAME
11
+
12
+ /* Define which directory should be used for package source. */
13
+ #undef PACKAGE_SOURCE_DIR
14
+
15
+ /* Define to the full name and version of this package. */
16
+ #undef PACKAGE_STRING
17
+
18
+ /* Define to the one symbol short name of this package. */
19
+ #undef PACKAGE_TARNAME
20
+
21
+ /* Define to the version of this package. */
22
+ #undef PACKAGE_VERSION
23
+
24
+ /* Define to 1 if you have the ANSI C header files. */
25
+ #undef STDC_HEADERS
26
+
27
+ /* Version number of package */
28
+ #undef VERSION
29
+
30
+ /* Define to empty if `const' does not conform to ANSI C. */
31
+ #undef const
32
+
33
+ /* Define as `__inline' if that's what the C compiler calls it, or to nothing
34
+ if it is not supported. */
35
+ #undef inline