chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/util/test.h
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_UTIL_TEST_H__
|
6
|
+
#define RE2_UTIL_TEST_H__
|
7
|
+
|
8
|
+
#include "util/util.h"
|
9
|
+
#include "util/flags.h"
|
10
|
+
|
11
|
+
// TEST(suite, name) forward-declares the function suite##name, defines a
// TestRegisterer object named r##suite##name that is constructed with that
// function and the string "suite.name", and finally opens the definition of
// suite##name -- the braces following the macro invocation are the test body.
#define TEST(suite, name) \
  void suite##name(void); \
  TestRegisterer r##suite##name(suite##name, #suite "." #name); \
  void suite##name(void)
|
15
|
+
|
16
|
+
void RegisterTest(void (*)(void), const char*);
|
17
|
+
|
18
|
+
class TestRegisterer {
|
19
|
+
public:
|
20
|
+
TestRegisterer(void (*fn)(void), const char *s) {
|
21
|
+
RegisterTest(fn, s);
|
22
|
+
}
|
23
|
+
};
|
24
|
+
|
25
|
+
// TODO(rsc): Do a better job. For now the EXPECT_* macros below are plain
// aliases for the corresponding CHECK* macros.
|
26
|
+
#define EXPECT_EQ CHECK_EQ
|
27
|
+
#define EXPECT_TRUE CHECK
|
28
|
+
#define EXPECT_LT CHECK_LT
|
29
|
+
#define EXPECT_GT CHECK_GT
|
30
|
+
#define EXPECT_LE CHECK_LE
|
31
|
+
#define EXPECT_GE CHECK_GE
|
32
|
+
#define EXPECT_FALSE(x) CHECK(!(x))
|
33
|
+
|
34
|
+
#define ARRAYSIZE arraysize
|
35
|
+
|
36
|
+
#define EXPECT_TRUE_M(x, y) CHECK(x) << (y)
|
37
|
+
#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
|
38
|
+
#define ASSERT_TRUE_M(x, y) CHECK(x) << (y)
|
39
|
+
#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y)
|
40
|
+
|
41
|
+
const bool UsingMallocCounter = false;
|
42
|
+
namespace testing {

// Stand-in MallocCounter: it measures nothing, so both growth queries
// always return 0 and Reset() does nothing.
class MallocCounter {
 public:
  // Constant with value 0.
  static const int THIS_THREAD_ONLY = 0;

  // The constructor argument is accepted and ignored.
  MallocCounter(int) {}

  long long HeapGrowth() { return 0LL; }
  long long PeakHeapGrowth() { return 0LL; }
  void Reset() {}
};

}  // namespace testing
|
52
|
+
|
53
|
+
namespace re2 {
|
54
|
+
int64 VirtualProcessSize();
|
55
|
+
} // namespace re2
|
56
|
+
|
57
|
+
#endif // RE2_UTIL_TEST_H__
|
data/ext/util/thread.h
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_UTIL_THREAD_H__
|
6
|
+
#define RE2_UTIL_THREAD_H__
|
7
|
+
|
8
|
+
#include <pthread.h>
|
9
|
+
|
10
|
+
// Abstract thread interface built around a pthread_t.  Concrete subclasses
// must implement Run().  Every other member function is only declared in
// this header; its definition lives elsewhere.
class Thread {
 public:
  Thread();
  virtual ~Thread();

  void Start();
  void Join();
  void SetJoinable(bool joinable);

  // Pure virtual: the work a concrete subclass performs.
  virtual void Run() = 0;

 private:
  pthread_t pid_;   // pthread handle (presumably set by Start() -- TODO confirm)
  bool running_;
  bool joinable_;
};
|
24
|
+
|
25
|
+
#endif // RE2_UTIL_THREAD_H__
|
26
|
+
|
data/ext/util/utf.h
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* The authors of this software are Rob Pike and Ken Thompson.
|
3
|
+
* Copyright (c) 2002 by Lucent Technologies.
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose without fee is hereby granted, provided that this entire notice
|
6
|
+
* is included in all copies of any software which is or includes a copy
|
7
|
+
* or modification of this software and in all copies of the supporting
|
8
|
+
* documentation for such software.
|
9
|
+
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
10
|
+
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
11
|
+
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
12
|
+
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
13
|
+
*
|
14
|
+
* This file and rune.cc have been converted to compile as C++ code
|
15
|
+
* in name space re2.
|
16
|
+
*/
|
17
|
+
#ifndef RE2_UTIL_UTF_H__
|
18
|
+
#define RE2_UTIL_UTF_H__
|
19
|
+
|
20
|
+
#include <stdint.h>
|
21
|
+
|
22
|
+
namespace re2 {

// A Rune holds a single Unicode code point.
typedef signed int Rune;  /* Code-point values in Unicode 4.0 are 21 bits wide. */

// No comma after the last enumerator: a trailing comma is only valid from
// C++11 on, and this code is still built by pre-C++11 compilers.
enum
{
  UTFmax    = 4,        /* maximum bytes per rune */
  Runesync  = 0x80,     /* values below this cannot represent part of a UTF sequence */
  Runeself  = 0x80,     /* for values below this, rune and UTF sequence are the same */
  Runeerror = 0xFFFD,   /* decoding error in UTF */
  Runemax   = 0x10FFFF  /* maximum rune value */
};

// UTF-8 helpers.  They are only declared here.
// NOTE(review): the one-line summaries below follow the Plan 9 rune(2)
// manual that these names come from -- confirm against the definitions.
int runetochar(char* s, const Rune* r);  // encode *r as UTF-8 into s; returns byte count
int chartorune(Rune* r, const char* s);  // decode one rune from s into *r; returns bytes consumed
int fullrune(const char* s, int n);      // nonzero if the first n bytes of s hold a complete rune
int utflen(const char* s);               // number of runes in the NUL-terminated string s
char* utfrune(const char* s, Rune c);    // first occurrence of rune c in s, or 0 if absent

}  // namespace re2
|
42
|
+
|
43
|
+
#endif // RE2_UTIL_UTF_H__
|
data/ext/util/util.h
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_UTIL_UTIL_H__
|
6
|
+
#define RE2_UTIL_UTIL_H__
|
7
|
+
|
8
|
+
// C
|
9
|
+
#include <stdio.h>
|
10
|
+
#include <string.h>
|
11
|
+
#include <stdint.h>
|
12
|
+
#include <stddef.h> // For size_t
|
13
|
+
#include <assert.h>
|
14
|
+
#include <stdarg.h>
|
15
|
+
#include <sys/time.h>
|
16
|
+
#include <time.h>
|
17
|
+
|
18
|
+
// C++
|
19
|
+
#include <vector>
|
20
|
+
#include <string>
|
21
|
+
#include <algorithm>
|
22
|
+
#include <iosfwd>
|
23
|
+
#include <map>
|
24
|
+
#include <stack>
|
25
|
+
#include <iostream>
|
26
|
+
#include <utility>
|
27
|
+
#include <set>
|
28
|
+
|
29
|
+
// Use std names.
|
30
|
+
using std::set;
|
31
|
+
using std::pair;
|
32
|
+
using std::vector;
|
33
|
+
using std::string;
|
34
|
+
using std::min;
|
35
|
+
using std::max;
|
36
|
+
using std::ostream;
|
37
|
+
using std::map;
|
38
|
+
using std::stack;
|
39
|
+
using std::sort;
|
40
|
+
using std::swap;
|
41
|
+
using std::make_pair;
|
42
|
+
|
43
|
+
// Select the unordered_set implementation and bring it into scope.
// Only pre-C++11 GNU-compatible toolchains that are not using libc++ still
// need the TR1 header: libc++ does not ship <tr1/...> at all, and any C++11
// (or later) compiler provides <unordered_set> directly.  Defining USE_CXX0X
// keeps forcing the standard header, as before.
#if defined(__GNUC__) && !defined(USE_CXX0X) && \
    !defined(_LIBCPP_VERSION) && __cplusplus < 201103L

#include <tr1/unordered_set>
using std::tr1::unordered_set;

#else

#include <unordered_set>
using std::unordered_set;

#endif
|
54
|
+
|
55
|
+
namespace re2 {
|
56
|
+
|
57
|
+
typedef int8_t int8;
|
58
|
+
typedef uint8_t uint8;
|
59
|
+
typedef int16_t int16;
|
60
|
+
typedef uint16_t uint16;
|
61
|
+
typedef int32_t int32;
|
62
|
+
typedef uint32_t uint32;
|
63
|
+
typedef int64_t int64;
|
64
|
+
typedef uint64_t uint64;
|
65
|
+
|
66
|
+
typedef unsigned long ulong;
|
67
|
+
typedef unsigned int uint;
|
68
|
+
typedef unsigned short ushort;
|
69
|
+
|
70
|
+
// COMPILE_ASSERT(expr, msg) causes a compile error mentioning msg if expr is
// not true: the typedef named msg then declares an array of size -1.
|
71
|
+
template<bool> struct CompileAssert {};
|
72
|
+
#define COMPILE_ASSERT(expr, msg) \
|
73
|
+
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
|
74
|
+
|
75
|
+
// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions.
|
76
|
+
// It goes in the private: declarations in a class.
|
77
|
+
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
|
78
|
+
TypeName(const TypeName&); \
|
79
|
+
void operator=(const TypeName&)
|
80
|
+
|
81
|
+
#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
|
82
|
+
|
83
|
+
// Fake lock annotations. For real ones, see
|
84
|
+
// http://code.google.com/p/data-race-test/
|
85
|
+
#define ANNOTATE_PUBLISH_MEMORY_RANGE(a, b)
|
86
|
+
#define ANNOTATE_IGNORE_WRITES_BEGIN()
|
87
|
+
#define ANNOTATE_IGNORE_WRITES_END()
|
88
|
+
#define ANNOTATE_BENIGN_RACE(a, b)
|
89
|
+
#define NO_THREAD_SAFETY_ANALYSIS
|
90
|
+
#define ANNOTATE_HAPPENS_BEFORE(x)
|
91
|
+
#define ANNOTATE_HAPPENS_AFTER(x)
|
92
|
+
|
93
|
+
class StringPiece;
|
94
|
+
|
95
|
+
string CEscape(const StringPiece& src);
|
96
|
+
int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
|
97
|
+
|
98
|
+
extern string StringPrintf(const char* format, ...);
|
99
|
+
extern void SStringPrintf(string* dst, const char* format, ...);
|
100
|
+
extern void StringAppendF(string* dst, const char* format, ...);
|
101
|
+
extern string PrefixSuccessor(const StringPiece& prefix);
|
102
|
+
|
103
|
+
uint32 hashword(const uint32*, size_t, uint32);
|
104
|
+
void hashword2(const uint32*, size_t, uint32*, uint32*);
|
105
|
+
|
106
|
+
// Returns the 32-bit hashword() hash of the buffer s, starting from seed.
//
// Only the first len/4 whole 32-bit words are hashed; the trailing len%4
// bytes do not contribute to the result.  The buffer is read through a
// const uint32 pointer, so constness is preserved (hashword takes
// const uint32*).
// NOTE(review): this reinterprets a char buffer as uint32 words, which
// presumes s is suitably aligned for uint32 access -- confirm at call sites.
static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {
  return hashword(reinterpret_cast<const uint32*>(s), len/4, seed);
}
|
109
|
+
|
110
|
+
// Returns a 64-bit hash of the buffer s built from the two 32-bit values
// that hashword2() leaves in its in/out arguments.
//
// The first in/out word starts as seed and the second as 0; after the call
// the first becomes the high 32 bits of the result and the second the low
// 32 bits.  Only the first len/4 whole 32-bit words are hashed; the trailing
// len%4 bytes do not contribute.  The buffer is read through a
// const uint32 pointer, so constness is preserved.
// NOTE(review): this reinterprets a char buffer as uint32 words, which
// presumes s is suitably aligned for uint32 access -- confirm at call sites.
static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {
  uint32 hi = seed;
  uint32 lo = 0;
  hashword2(reinterpret_cast<const uint32*>(s), len/4, &hi, &lo);
  return (static_cast<uint64>(hi) << 32) | lo;
}
|
117
|
+
|
118
|
+
int RunningOnValgrind();
|
119
|
+
|
120
|
+
} // namespace re2
|
121
|
+
|
122
|
+
#include "util/arena.h"
|
123
|
+
#include "util/logging.h"
|
124
|
+
#include "util/mutex.h"
|
125
|
+
#include "util/utf.h"
|
126
|
+
|
127
|
+
#endif // RE2_UTIL_UTIL_H__
|