chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/util/test.h ADDED
@@ -0,0 +1,57 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_UTIL_TEST_H__
6
+ #define RE2_UTIL_TEST_H__
7
+
8
+ #include "util/util.h"
9
+ #include "util/flags.h"
10
+
11
+ #define TEST(x, y) /* declares x##y, registers it under the name "x.y" through a TestRegisterer object, then begins its definition */ \
12
+ void x##y(void); \
13
+ TestRegisterer r##x##y(x##y, # x "." # y); \
14
+ void x##y(void)
15
+
16
+ void RegisterTest(void (*)(void), const char*);
17
+
18
+ class TestRegisterer {  // Helper whose only job is to call RegisterTest from its constructor.
19
+ public:
20
+ TestRegisterer(void (*fn)(void), const char *s) {  // fn: test function pointer; s: name string for it
21
+ RegisterTest(fn, s);  // both arguments are forwarded unchanged
22
+ }
23
+ };
24
+
25
+ // TODO(rsc): Do a better job.
26
+ #define EXPECT_EQ CHECK_EQ
27
+ #define EXPECT_TRUE CHECK
28
+ #define EXPECT_LT CHECK_LT
29
+ #define EXPECT_GT CHECK_GT
30
+ #define EXPECT_LE CHECK_LE
31
+ #define EXPECT_GE CHECK_GE
32
+ #define EXPECT_FALSE(x) CHECK(!(x))
33
+
34
+ #define ARRAYSIZE arraysize
35
+
36
+ #define EXPECT_TRUE_M(x, y) CHECK(x) << (y)
37
+ #define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
38
+ #define ASSERT_TRUE_M(x, y) CHECK(x) << (y)
39
+ #define ASSERT_EQUALS(x, y) CHECK_EQ(x, y)
40
+
41
+ const bool UsingMallocCounter = false;
42
+ namespace testing {
43
+ class MallocCounter {  // No-op stand-in: every measurement below is hard-coded to 0.
44
+ public:
45
+ MallocCounter(int x) { }  // x is accepted but ignored
46
+ static const int THIS_THREAD_ONLY = 0;
47
+ long long HeapGrowth() { return 0; }  // always reports 0
48
+ long long PeakHeapGrowth() { return 0; }  // always reports 0
49
+ void Reset() { }  // empty body: there is no state to reset
50
+ };
51
+ } // namespace testing
52
+
53
+ namespace re2 {
54
+ int64 VirtualProcessSize();
55
+ } // namespace re2
56
+
57
+ #endif // RE2_UTIL_TEST_H__
data/ext/util/thread.h ADDED
@@ -0,0 +1,26 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_UTIL_THREAD_H__
6
+ #define RE2_UTIL_THREAD_H__
7
+
8
+ #include <pthread.h>
9
+
10
+ class Thread {  // Abstract thread interface; subclasses supply the work in Run().
11
+ public:
12
+ Thread();
13
+ virtual ~Thread();
14
+ void Start();  // NOTE(review): presumably runs Run() on a new pthread; definition is not in this header
15
+ void Join();
16
+ void SetJoinable(bool);
17
+ virtual void Run() = 0;  // pure virtual: every concrete subclass must override this
18
+
19
+ private:
20
+ pthread_t pid_;  // pthread handle type from <pthread.h>
21
+ bool running_;
22
+ bool joinable_;
23
+ };
24
+
25
+ #endif // RE2_UTIL_THREAD_H__
26
+
data/ext/util/utf.h ADDED
@@ -0,0 +1,43 @@
1
+ /*
2
+ * The authors of this software are Rob Pike and Ken Thompson.
3
+ * Copyright (c) 2002 by Lucent Technologies.
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose without fee is hereby granted, provided that this entire notice
6
+ * is included in all copies of any software which is or includes a copy
7
+ * or modification of this software and in all copies of the supporting
8
+ * documentation for such software.
9
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13
+ *
14
+ * This file and rune.cc have been converted to compile as C++ code
15
+ * in name space re2.
16
+ */
17
+ #ifndef RE2_UTIL_UTF_H__
18
+ #define RE2_UTIL_UTF_H__
19
+
20
+ #include <stdint.h>
21
+
22
+ namespace re2 {
23
+
24
+ typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
25
+
26
+ enum
27
+ {
28
+ UTFmax = 4, /* maximum bytes per rune */
29
+ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
30
+ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
31
+ Runeerror = 0xFFFD, /* decoding error in UTF */
32
+ Runemax = 0x10FFFF, /* maximum rune value */
33
+ };
34
+
35
+ int runetochar(char* s, const Rune* r);
36
+ int chartorune(Rune* r, const char* s);
37
+ int fullrune(const char* s, int n);
38
+ int utflen(const char* s);
39
+ char* utfrune(const char*, Rune);
40
+
41
+ } // namespace re2
42
+
43
+ #endif // RE2_UTIL_UTF_H__
data/ext/util/util.h ADDED
@@ -0,0 +1,127 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_UTIL_UTIL_H__
6
+ #define RE2_UTIL_UTIL_H__
7
+
8
+ // C
9
+ #include <stdio.h>
10
+ #include <string.h>
11
+ #include <stdint.h>
12
+ #include <stddef.h> // For size_t
13
+ #include <assert.h>
14
+ #include <stdarg.h>
15
+ #include <sys/time.h>
16
+ #include <time.h>
17
+
18
+ // C++
19
+ #include <vector>
20
+ #include <string>
21
+ #include <algorithm>
22
+ #include <iosfwd>
23
+ #include <map>
24
+ #include <stack>
25
+ #include <iostream>
26
+ #include <utility>
27
+ #include <set>
28
+
29
+ // Use std names.
30
+ using std::set;
31
+ using std::pair;
32
+ using std::vector;
33
+ using std::string;
34
+ using std::min;
35
+ using std::max;
36
+ using std::ostream;
37
+ using std::map;
38
+ using std::stack;
39
+ using std::sort;
40
+ using std::swap;
41
+ using std::make_pair;
42
+
43
+ #if defined(__GNUC__) && !defined(USE_CXX0X)
44
+
45
+ #include <tr1/unordered_set>
46
+ using std::tr1::unordered_set;
47
+
48
+ #else
49
+
50
+ #include <unordered_set>
51
+ using std::unordered_set;
52
+
53
+ #endif
54
+
55
+ namespace re2 {
56
+
57
+ typedef int8_t int8;
58
+ typedef uint8_t uint8;
59
+ typedef int16_t int16;
60
+ typedef uint16_t uint16;
61
+ typedef int32_t int32;
62
+ typedef uint32_t uint32;
63
+ typedef int64_t int64;
64
+ typedef uint64_t uint64;
65
+
66
+ typedef unsigned long ulong;
67
+ typedef unsigned int uint;
68
+ typedef unsigned short ushort;
69
+
70
+ // COMPILE_ASSERT causes a compile error about msg if expr is not true.
71
+ template<bool> struct CompileAssert {};
72
+ #define COMPILE_ASSERT(expr, msg) \
73
+ typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
74
+
75
+ // DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions.
76
+ // It goes in the private: declarations in a class.
77
+ #define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
78
+ TypeName(const TypeName&); \
79
+ void operator=(const TypeName&)
80
+
81
+ #define arraysize(array) (sizeof(array)/sizeof((array)[0]))
82
+
83
+ // Fake lock annotations. For real ones, see
84
+ // http://code.google.com/p/data-race-test/
85
+ #define ANNOTATE_PUBLISH_MEMORY_RANGE(a, b)
86
+ #define ANNOTATE_IGNORE_WRITES_BEGIN()
87
+ #define ANNOTATE_IGNORE_WRITES_END()
88
+ #define ANNOTATE_BENIGN_RACE(a, b)
89
+ #define NO_THREAD_SAFETY_ANALYSIS
90
+ #define ANNOTATE_HAPPENS_BEFORE(x)
91
+ #define ANNOTATE_HAPPENS_AFTER(x)
92
+
93
+ class StringPiece;
94
+
95
+ string CEscape(const StringPiece& src);
96
+ int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
97
+
98
+ extern string StringPrintf(const char* format, ...);
99
+ extern void SStringPrintf(string* dst, const char* format, ...);
100
+ extern void StringAppendF(string* dst, const char* format, ...);
101
+ extern string PrefixSuccessor(const StringPiece& prefix);
102
+
103
+ uint32 hashword(const uint32*, size_t, uint32);
104
+ void hashword2(const uint32*, size_t, uint32*, uint32*);
105
+
106
+ static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {  // 32-bit hash of s, computed by hashword() with the given seed
107
+ return hashword((uint32*)s, len/4, seed);  // len/4 = count of whole 32-bit words, so the trailing len%4 bytes are not passed on. NOTE(review): the cast drops const and reads s as uint32 - confirm callers pass suitably aligned data.
108
+ }
109
+
110
+ static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {  // 64-bit value assembled from the two 32-bit words handed to hashword2()
111
+ uint32 x, y;
112
+ x = seed;  // the seed enters through x
113
+ y = 0;
114
+ hashword2((uint32*)s, len/4, &x, &y);  // x and y are passed by pointer (presumably updated in place - confirm); len/4 counts whole 32-bit words only
115
+ return ((uint64)x << 32) | y;  // x forms the high 32 bits, y the low 32 bits
116
+ }
117
+
118
+ int RunningOnValgrind();
119
+
120
+ } // namespace re2
121
+
122
+ #include "util/arena.h"
123
+ #include "util/logging.h"
124
+ #include "util/mutex.h"
125
+ #include "util/utf.h"
126
+
127
+ #endif // RE2_UTIL_UTIL_H__