chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/util/test.h
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_UTIL_TEST_H__
|
6
|
+
#define RE2_UTIL_TEST_H__
|
7
|
+
|
8
|
+
#include "util/util.h"
|
9
|
+
#include "util/flags.h"
|
10
|
+
|
11
|
+
// TEST(x, y) forward-declares the function x##y, defines a TestRegisterer
// object named r##x##y that is constructed with that function and the label
// "x.y", and then begins the definition of x##y. The braces that follow the
// macro invocation become the body of the test function.
#define TEST(x, y) \
  void x##y(void); \
  TestRegisterer r##x##y(x##y, # x "." # y); \
  void x##y(void)
|
15
|
+
|
16
|
+
void RegisterTest(void (*)(void), const char*);
|
17
|
+
|
18
|
+
class TestRegisterer {
|
19
|
+
public:
|
20
|
+
TestRegisterer(void (*fn)(void), const char *s) {
|
21
|
+
RegisterTest(fn, s);
|
22
|
+
}
|
23
|
+
};
|
24
|
+
|
25
|
+
// TODO(rsc): Do a better job.
|
26
|
+
// The EXPECT_* names are plain aliases for the corresponding CHECK* macros
// (which are not defined in this header), so they behave exactly as CHECK*
// does.
#define EXPECT_EQ       CHECK_EQ
#define EXPECT_TRUE     CHECK
#define EXPECT_LT       CHECK_LT
#define EXPECT_GT       CHECK_GT
#define EXPECT_LE       CHECK_LE
#define EXPECT_GE       CHECK_GE
#define EXPECT_FALSE(x) CHECK(!(x))

// Upper-case spelling of the arraysize macro.
#define ARRAYSIZE arraysize

// Variants that stream an extra message y into the CHECK.
#define EXPECT_TRUE_M(x, y)  CHECK(x) << (y)
#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
#define ASSERT_TRUE_M(x, y)  CHECK(x) << (y)
#define ASSERT_EQUALS(x, y)  CHECK_EQ(x, y)
|
40
|
+
|
41
|
+
// False here: the MallocCounter below is a stub that never measures anything.
const bool UsingMallocCounter = false;

namespace testing {

// Stub heap-usage counter. Every query reports zero growth and Reset() does
// nothing, so code written against this interface compiles and runs without
// any real allocation tracking.
class MallocCounter {
 public:
  // The mode argument (for example THIS_THREAD_ONLY) is accepted and ignored;
  // it is left unnamed so the stub does not trigger unused-parameter warnings.
  MallocCounter(int) { }

  static const int THIS_THREAD_ONLY = 0;

  // Always 0. Marked const so the queries also work through const references.
  long long HeapGrowth() const { return 0; }
  long long PeakHeapGrowth() const { return 0; }

  // No state to clear.
  void Reset() { }
};

}  // namespace testing
|
52
|
+
|
53
|
+
namespace re2 {
|
54
|
+
int64 VirtualProcessSize();
|
55
|
+
} // namespace re2
|
56
|
+
|
57
|
+
#endif // RE2_UTIL_TEST_H__
|
data/ext/util/thread.h
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_UTIL_THREAD_H__
|
6
|
+
#define RE2_UTIL_THREAD_H__
|
7
|
+
|
8
|
+
#include <pthread.h>
|
9
|
+
|
10
|
+
// Abstract thread interface holding a pthread_t handle. Subclasses provide
// the work by overriding Run(). Only declarations appear here; the member
// functions are defined elsewhere.
class Thread {
 public:
  Thread();
  virtual ~Thread();

  // NOTE(review): presumably Start() launches Run() on a new pthread and
  // Join() waits for it -- confirm against the implementation file.
  void Start();
  void Join();
  void SetJoinable(bool);

  // Pure virtual: every concrete subclass must implement it.
  virtual void Run() = 0;

 private:
  pthread_t pid_;
  bool running_;
  bool joinable_;
};
|
24
|
+
|
25
|
+
#endif // RE2_UTIL_THREAD_H__
|
26
|
+
|
data/ext/util/utf.h
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* The authors of this software are Rob Pike and Ken Thompson.
|
3
|
+
* Copyright (c) 2002 by Lucent Technologies.
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose without fee is hereby granted, provided that this entire notice
|
6
|
+
* is included in all copies of any software which is or includes a copy
|
7
|
+
* or modification of this software and in all copies of the supporting
|
8
|
+
* documentation for such software.
|
9
|
+
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
10
|
+
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
11
|
+
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
12
|
+
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
13
|
+
*
|
14
|
+
* This file and rune.cc have been converted to compile as C++ code
|
15
|
+
* in name space re2.
|
16
|
+
*/
|
17
|
+
#ifndef RE2_UTIL_UTF_H__
|
18
|
+
#define RE2_UTIL_UTF_H__
|
19
|
+
|
20
|
+
#include <stdint.h>
|
21
|
+
|
22
|
+
namespace re2 {

typedef signed int Rune;  /* Code-point values in Unicode 4.0 are 21 bits wide. */

/*
 * UTF-8 encoding limits. No comma follows the last enumerator: a trailing
 * comma is only accepted from C++11 on, and this code base is also built
 * in pre-C++11 mode.
 */
enum
{
  UTFmax    = 4,        /* maximum bytes per rune */
  Runesync  = 0x80,     /* values < Runesync cannot represent part of a UTF sequence */
  Runeself  = 0x80,     /* for values < Runeself, rune and UTF sequence are the same */
  Runeerror = 0xFFFD,   /* decoding error in UTF */
  Runemax   = 0x10FFFF  /* maximum rune value */
};

/*
 * Rune <-> UTF-8 helpers; declared here, defined elsewhere (the header
 * comment in this file names rune.cc).
 * NOTE(review): the names follow the Plan 9 rune interface; exact return
 * conventions should be confirmed against rune.cc.
 */
int runetochar(char* s, const Rune* r);
int chartorune(Rune* r, const char* s);
int fullrune(const char* s, int n);
int utflen(const char* s);
char* utfrune(const char*, Rune);

}  // namespace re2
|
42
|
+
|
43
|
+
#endif // RE2_UTIL_UTF_H__
|
data/ext/util/util.h
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_UTIL_UTIL_H__
|
6
|
+
#define RE2_UTIL_UTIL_H__
|
7
|
+
|
8
|
+
// C
|
9
|
+
#include <stdio.h>
|
10
|
+
#include <string.h>
|
11
|
+
#include <stdint.h>
|
12
|
+
#include <stddef.h> // For size_t
|
13
|
+
#include <assert.h>
|
14
|
+
#include <stdarg.h>
|
15
|
+
#include <sys/time.h>
|
16
|
+
#include <time.h>
|
17
|
+
|
18
|
+
// C++
|
19
|
+
#include <vector>
|
20
|
+
#include <string>
|
21
|
+
#include <algorithm>
|
22
|
+
#include <iosfwd>
|
23
|
+
#include <map>
|
24
|
+
#include <stack>
|
25
|
+
#include <iostream>
|
26
|
+
#include <utility>
|
27
|
+
#include <set>
|
28
|
+
|
29
|
+
// Use std names.
|
30
|
+
using std::set;
|
31
|
+
using std::pair;
|
32
|
+
using std::vector;
|
33
|
+
using std::string;
|
34
|
+
using std::min;
|
35
|
+
using std::max;
|
36
|
+
using std::ostream;
|
37
|
+
using std::map;
|
38
|
+
using std::stack;
|
39
|
+
using std::sort;
|
40
|
+
using std::swap;
|
41
|
+
using std::make_pair;
|
42
|
+
|
43
|
+
#if defined(__GNUC__) && !defined(USE_CXX0X)
|
44
|
+
|
45
|
+
#include <tr1/unordered_set>
|
46
|
+
using std::tr1::unordered_set;
|
47
|
+
|
48
|
+
#else
|
49
|
+
|
50
|
+
#include <unordered_set>
|
51
|
+
using std::unordered_set;
|
52
|
+
|
53
|
+
#endif
|
54
|
+
|
55
|
+
namespace re2 {
|
56
|
+
|
57
|
+
// Short spellings for the <stdint.h> fixed-width integer types.
typedef int8_t   int8;
typedef uint8_t  uint8;
typedef int16_t  int16;
typedef uint16_t uint16;
typedef int32_t  int32;
typedef uint32_t uint32;
typedef int64_t  int64;
typedef uint64_t uint64;

// Short spellings for the built-in unsigned types.
typedef unsigned long  ulong;
typedef unsigned int   uint;
typedef unsigned short ushort;
|
69
|
+
|
70
|
+
// COMPILE_ASSERT(expr, msg) breaks the build when expr is false. It declares
// a typedef called msg for an array of CompileAssert<...> whose length is 1
// when expr holds and -1 (ill-formed) when it does not, so the identifier
// msg appears in the compiler's error.
template<bool> struct CompileAssert {};
#define COMPILE_ASSERT(expr, msg) \
  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
|
74
|
+
|
75
|
+
// DISALLOW_EVIL_CONSTRUCTORS(TypeName) declares the copy constructor and
// operator= of TypeName without defining them. Place it in the private:
// section of the class so that copying and assignment are rejected.
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
  TypeName(const TypeName&);                 \
  void operator=(const TypeName&)
|
80
|
+
|
81
|
+
// Element count of a true array: total size divided by the size of element 0.
// The argument must be an array, not a pointer; with a pointer the division
// is between the pointer size and the element size.
#define arraysize(array) (sizeof(array) / sizeof((array)[0]))
|
82
|
+
|
83
|
+
// Fake lock annotations: each macro below expands to nothing, so annotated
// code compiles unchanged. For real ones, see
// http://code.google.com/p/data-race-test/
#define ANNOTATE_PUBLISH_MEMORY_RANGE(a, b)
#define ANNOTATE_IGNORE_WRITES_BEGIN()
#define ANNOTATE_IGNORE_WRITES_END()
#define ANNOTATE_BENIGN_RACE(a, b)
#define NO_THREAD_SAFETY_ANALYSIS
#define ANNOTATE_HAPPENS_BEFORE(x)
#define ANNOTATE_HAPPENS_AFTER(x)
|
92
|
+
|
93
|
+
class StringPiece;
|
94
|
+
|
95
|
+
string CEscape(const StringPiece& src);
|
96
|
+
int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
|
97
|
+
|
98
|
+
extern string StringPrintf(const char* format, ...);
|
99
|
+
extern void SStringPrintf(string* dst, const char* format, ...);
|
100
|
+
extern void StringAppendF(string* dst, const char* format, ...);
|
101
|
+
extern string PrefixSuccessor(const StringPiece& prefix);
|
102
|
+
|
103
|
+
uint32 hashword(const uint32*, size_t, uint32);
|
104
|
+
void hashword2(const uint32*, size_t, uint32*, uint32*);
|
105
|
+
|
106
|
+
// Returns hashword() over the first len/4 whole 32-bit words of s, mixed
// with seed. The trailing len%4 bytes are not hashed (unchanged from the
// original behavior). A len <= 0 hashes zero words.
static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {
  // Guard the int -> size_t conversion: a negative len would otherwise turn
  // into an enormous word count.
  const size_t nwords = len > 0 ? static_cast<size_t>(len) / 4 : 0;

  // hashword() reads uint32 values; a char buffer is not guaranteed to be
  // suitably aligned, so copy it into aligned storage when it is not.
  if (nwords != 0 && reinterpret_cast<uintptr_t>(s) % sizeof(uint32) != 0) {
    std::vector<uint32> aligned(nwords);
    memcpy(&aligned[0], s, nwords * sizeof(uint32));
    return hashword(&aligned[0], nwords, seed);
  }
  return hashword(reinterpret_cast<const uint32*>(s), nwords, seed);
}
|
109
|
+
|
110
|
+
// Returns a 64-bit hash of the first len/4 whole 32-bit words of s, built
// from the two 32-bit outputs of hashword2(). The first in/out value starts
// as seed and becomes the high half; the second starts as 0 and becomes the
// low half. The trailing len%4 bytes are not hashed (unchanged from the
// original behavior). A len <= 0 hashes zero words.
static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {
  // Guard the int -> size_t conversion: a negative len would otherwise turn
  // into an enormous word count.
  const size_t nwords = len > 0 ? static_cast<size_t>(len) / 4 : 0;

  uint32 hi = seed;
  uint32 lo = 0;

  // hashword2() reads uint32 values; a char buffer is not guaranteed to be
  // suitably aligned, so copy it into aligned storage when it is not.
  if (nwords != 0 && reinterpret_cast<uintptr_t>(s) % sizeof(uint32) != 0) {
    std::vector<uint32> aligned(nwords);
    memcpy(&aligned[0], s, nwords * sizeof(uint32));
    hashword2(&aligned[0], nwords, &hi, &lo);
  } else {
    hashword2(reinterpret_cast<const uint32*>(s), nwords, &hi, &lo);
  }
  return (static_cast<uint64>(hi) << 32) | lo;
}
|
117
|
+
|
118
|
+
int RunningOnValgrind();
|
119
|
+
|
120
|
+
} // namespace re2
|
121
|
+
|
122
|
+
#include "util/arena.h"
|
123
|
+
#include "util/logging.h"
|
124
|
+
#include "util/mutex.h"
|
125
|
+
#include "util/utf.h"
|
126
|
+
|
127
|
+
#endif // RE2_UTIL_UTIL_H__
|