chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/test/test_entities.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
describe 'Chipper entities' do
|
5
|
+
before do
|
6
|
+
@tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://t.co/IsSh1t12"
|
7
|
+
end
|
8
|
+
|
9
|
+
after do
|
10
|
+
Chipper.skip_users nil
|
11
|
+
Chipper.skip_hashtags nil
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should extract users' do
|
15
|
+
Chipper.users(@tweet).must_equal %w(@youtube @apple)
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should extract hashtags' do
|
19
|
+
Chipper.hashtags(@tweet).must_equal %w(#cnn #removed)
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should extract urls' do
|
23
|
+
Chipper.urls(@tweet).must_equal %w(http://t.co/IsSh1t12)
|
24
|
+
end
|
25
|
+
|
26
|
+
# NB: the path should in fact match only [a-zA-Z0-9]+, not \w, as \w will also include unicode chars
|
27
|
+
it 'should extract t.co urls cleanly w/quote' do
|
28
|
+
text = "Hello http://t.co/97CLxVkD\" http://t.co/12345678, http://t.co/xxxxxxxx. http://t.co/xxxx1234' damn!"
|
29
|
+
urls = %w(http://t.co/97CLxVkD http://t.co/12345678 http://t.co/xxxxxxxx http://t.co/xxxx1234)
|
30
|
+
Chipper.urls(text).must_equal(urls)
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'should skip users' do
|
34
|
+
Chipper.skip_users(%w(youtube))
|
35
|
+
Chipper.users(@tweet).must_equal %w(@apple)
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should skip hashtags' do
|
39
|
+
Chipper.skip_hashtags(%w(cnn))
|
40
|
+
Chipper.hashtags(@tweet).must_equal %w(#removed)
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should return all entities using #entities method' do
|
44
|
+
expected = {}
|
45
|
+
expected.merge! users: %w(@youtube @apple)
|
46
|
+
expected.merge! hashtags: %w(#cnn #removed)
|
47
|
+
expected.merge! urls: %w(http://t.co/IsSh1t12)
|
48
|
+
expected.merge! tokens: [["why"], ["that", "stories", "abt"], ["always", "get"], ["from"]]
|
49
|
+
|
50
|
+
Chipper.entities(@tweet).must_equal expected
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'should work around shitty urls' do
|
54
|
+
crap = "foo bar https://t.co/KCZSuVx½"
|
55
|
+
Chipper.urls(crap).must_equal ['https://t.co/KCZSuVx']
|
56
|
+
end
|
57
|
+
end
|
data/test/test_tokens.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
describe 'Chipper tokens' do
|
4
|
+
before do
|
5
|
+
@tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://www1.youtube.com/videos/?"
|
6
|
+
end
|
7
|
+
|
8
|
+
after do
|
9
|
+
Chipper.skip_tokens nil
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should extract tokens' do
|
13
|
+
expected = [%w(why), %w(that stories abt), %w(always get), %w(from)]
|
14
|
+
Chipper.tokens(@tweet).must_equal expected
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'should skip tokens' do
|
18
|
+
expected = [%w(why), %w(that), %w(abt), %w(get)]
|
19
|
+
Chipper.skip_tokens(%w(story always from video))
|
20
|
+
Chipper.tokens(@tweet).must_equal expected
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should skip numbers dates and times' do
|
24
|
+
expected = [["flubble","bubble"],["champion"]]
|
25
|
+
Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
|
26
|
+
Chipper.tokens("flubble bubble I or 123454 123-123-1111 19 on te champion 19th at 4am or 12pm").must_equal expected
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should skip random other chars dates and times' do
|
30
|
+
expected = [["flubble","bubble"],["champion","winter"],["incredible"]]
|
31
|
+
Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
|
32
|
+
Chipper.tokens("flubble_bubble ^o^ champion_winter oh_____ _______ ^incredible ").must_equal expected
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should be lowercase' do
|
36
|
+
expected = [["flubble","bubble"]]
|
37
|
+
Chipper.tokens("FLUBBLE BUBBLE").must_equal expected
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'stop words should not be case sensitive' do
|
41
|
+
expected = [["pancakes"]]
|
42
|
+
Chipper.skip_tokens(%w(christmas))
|
43
|
+
Chipper.tokens("pancakes in Christmas").must_equal expected
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should filter stemmed words that are too short' do
|
47
|
+
expected = [["flubble","bubble"]]
|
48
|
+
Chipper.tokens("I am going to doing, being flubble bubble its").must_equal expected
|
49
|
+
end
|
50
|
+
|
51
|
+
# e.g. don't, won't, aren't, etc. A quick 'cheat' way to do this is to just sub out all single quotes with nothing.
|
52
|
+
it 'should remove abbreviations with single quotes' do
|
53
|
+
expected = [["flubble","bubble","dont"]]
|
54
|
+
Chipper.tokens("flubble bubble don't do it").must_equal expected
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should segment across urls' do
|
58
|
+
expected = [%w(hello world), %w(this), %w(might work)]
|
59
|
+
Chipper.tokens('hello world, this http://www.example.com/1 might work').must_equal expected
|
60
|
+
end
|
61
|
+
|
62
|
+
describe 'segment across short, stop and non-words' do
|
63
|
+
before do
|
64
|
+
@expected = [%w(flopper bopper), %w(dopper)]
|
65
|
+
Chipper.skip_tokens(%w(four five six stopper))
|
66
|
+
end
|
67
|
+
|
68
|
+
after do
|
69
|
+
Chipper.skip_tokens nil
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'should segment correctly on short word' do
|
73
|
+
Chipper.tokens('Flopper Bopper at Dopper').must_equal @expected
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should segment correctly on stop word' do
|
77
|
+
Chipper.tokens('Flopper Bopper STOPPER Dopper').must_equal @expected
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'should segment correctly on non-word' do
|
81
|
+
Chipper.tokens('Flopper Bopper. Dopper').must_equal @expected
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'should not get stuffs from users or hashtags' do
|
85
|
+
Chipper.tokens("melbourne @sydney_islame or #brisbane_humid").must_equal [["melbourne"]]
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should not add random newlines becasue there is underscores, full stop, space then url' do
|
89
|
+
Chipper.tokens("one__two.three http://t.co").must_equal [["one","two"],["three"]]
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'should not add random newlines becasue there is leading underscore colon space then url' do
|
93
|
+
Chipper.tokens('bushes _purple: http://t.co/').must_equal [["bushes","purple"]]
|
94
|
+
end
|
95
|
+
|
96
|
+
it 'should group and segment across repeated or anchored underscores' do
|
97
|
+
Chipper.tokens('_foo bar._baz: hello__world! http://t.co/').must_equal [%w(foo bar), %w(baz), %w(hello world)]
|
98
|
+
Chipper.tokens('_foo bar._baz:_hello #hi,world').must_equal [%w(foo bar), %w(baz), %w(hello), %w(world)]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
describe 'unicode' do
|
103
|
+
it 'should skip quotes and handle fullwidth @' do
|
104
|
+
text = "hello world, ain\u2019t this \uff20cool huh\u201d"
|
105
|
+
Chipper.tokens(text).must_equal [["hello", "world"], ["aint", "this"], ["huh"]]
|
106
|
+
end
|
107
|
+
|
108
|
+
it 'should kill tokens that start with unicode' do
|
109
|
+
text = "hello world, \u2020\uff26\u201f \u2021\uff36\u210fcool 12TH 123345134 awesome\u2020\uff26\u201f"
|
110
|
+
Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$|^\W.*$}
|
111
|
+
Chipper.tokens(text).must_equal [["hello", "world"], ["awesome\u2020\uff26"]]
|
112
|
+
end
|
113
|
+
|
114
|
+
it 'should segment on unicode puncutation' do
|
115
|
+
Chipper.tokens("\u2039Hello\u203aWorld").must_equal [["hello"], ["world"]]
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
metadata
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: chipper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 4
|
8
|
+
- 2
|
9
|
+
version: 0.4.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Bharanee Rathna
|
13
|
+
- John Barratt
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-01-16 00:00:00 +11:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: twitter text extraction utilities
|
23
|
+
email:
|
24
|
+
- deepfryed@gmail.com
|
25
|
+
- djon00@gmail.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions:
|
29
|
+
- ext/extconf.rb
|
30
|
+
extra_rdoc_files:
|
31
|
+
- README.rdoc
|
32
|
+
files:
|
33
|
+
- ext/src/chipper.cc
|
34
|
+
- ext/re2/filtered_re2.cc
|
35
|
+
- ext/re2/unicode_casefold.cc
|
36
|
+
- ext/re2/prefilter.cc
|
37
|
+
- ext/re2/prefilter_tree.cc
|
38
|
+
- ext/re2/re2.cc
|
39
|
+
- ext/re2/valgrind.cc
|
40
|
+
- ext/re2/hash.cc
|
41
|
+
- ext/re2/parse.cc
|
42
|
+
- ext/re2/stringpiece.cc
|
43
|
+
- ext/re2/set.cc
|
44
|
+
- ext/re2/bitstate.cc
|
45
|
+
- ext/re2/prog.cc
|
46
|
+
- ext/re2/simplify.cc
|
47
|
+
- ext/re2/rune.cc
|
48
|
+
- ext/re2/dfa.cc
|
49
|
+
- ext/re2/onepass.cc
|
50
|
+
- ext/re2/unicode_groups.cc
|
51
|
+
- ext/re2/regexp.cc
|
52
|
+
- ext/re2/nfa.cc
|
53
|
+
- ext/re2/perl_groups.cc
|
54
|
+
- ext/re2/mimics_pcre.cc
|
55
|
+
- ext/re2/compile.cc
|
56
|
+
- ext/re2/tostring.cc
|
57
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_russian.c
|
58
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c
|
59
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c
|
60
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
|
61
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_danish.c
|
62
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
|
63
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
|
64
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_swedish.c
|
65
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c
|
66
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
|
67
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
|
68
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
|
69
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_german.c
|
70
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_romanian.c
|
71
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_turkish.c
|
72
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_italian.c
|
73
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
|
74
|
+
- ext/libstemmer_c/src_c/stem_KOI8_R_russian.c
|
75
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c
|
76
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c
|
77
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c
|
78
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_french.c
|
79
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_english.c
|
80
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_finnish.c
|
81
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
|
82
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c
|
83
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_porter.c
|
84
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_spanish.c
|
85
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
|
86
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_dutch.c
|
87
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c
|
88
|
+
- ext/libstemmer_c/libstemmer/libstemmer.c
|
89
|
+
- ext/libstemmer_c/libstemmer/libstemmer_utf8.c
|
90
|
+
- ext/libstemmer_c/runtime/api.c
|
91
|
+
- ext/libstemmer_c/runtime/utilities.c
|
92
|
+
- ext/libstemmer_c/examples/stemwords.c
|
93
|
+
- ext/libstemmer_c/mkinc_utf8.mak
|
94
|
+
- ext/libstemmer_c/mkinc.mak
|
95
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
|
96
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_porter.h
|
97
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_danish.h
|
98
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_english.h
|
99
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h
|
100
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
|
101
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_romanian.h
|
102
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h
|
103
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
|
104
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h
|
105
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h
|
106
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h
|
107
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
|
108
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h
|
109
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_finnish.h
|
110
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_italian.h
|
111
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_russian.h
|
112
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_spanish.h
|
113
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_swedish.h
|
114
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_french.h
|
115
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h
|
116
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_turkish.h
|
117
|
+
- ext/libstemmer_c/src_c/stem_KOI8_R_russian.h
|
118
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_german.h
|
119
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
|
120
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
|
121
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h
|
122
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h
|
123
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_dutch.h
|
124
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
|
125
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
|
126
|
+
- ext/libstemmer_c/include/libstemmer.h
|
127
|
+
- ext/libstemmer_c/libstemmer/modules_utf8.h
|
128
|
+
- ext/libstemmer_c/libstemmer/modules.h
|
129
|
+
- ext/libstemmer_c/runtime/header.h
|
130
|
+
- ext/libstemmer_c/runtime/api.h
|
131
|
+
- ext/src/version.h
|
132
|
+
- ext/util/benchmark.h
|
133
|
+
- ext/util/arena.h
|
134
|
+
- ext/util/thread.h
|
135
|
+
- ext/util/mutex.h
|
136
|
+
- ext/util/atomicops.h
|
137
|
+
- ext/util/valgrind.h
|
138
|
+
- ext/util/test.h
|
139
|
+
- ext/util/random.h
|
140
|
+
- ext/util/sparse_set.h
|
141
|
+
- ext/util/utf.h
|
142
|
+
- ext/util/pcre.h
|
143
|
+
- ext/util/flags.h
|
144
|
+
- ext/util/util.h
|
145
|
+
- ext/util/sparse_array.h
|
146
|
+
- ext/util/logging.h
|
147
|
+
- ext/re2/stringpiece.h
|
148
|
+
- ext/re2/prefilter_tree.h
|
149
|
+
- ext/re2/prefilter.h
|
150
|
+
- ext/re2/walker-inl.h
|
151
|
+
- ext/re2/variadic_function.h
|
152
|
+
- ext/re2/set.h
|
153
|
+
- ext/re2/re2.h
|
154
|
+
- ext/re2/regexp.h
|
155
|
+
- ext/re2/filtered_re2.h
|
156
|
+
- ext/re2/unicode_casefold.h
|
157
|
+
- ext/re2/prog.h
|
158
|
+
- ext/re2/unicode_groups.h
|
159
|
+
- ext/extconf.rb
|
160
|
+
- ext/stemmer.rb
|
161
|
+
- test/test_tokens.rb
|
162
|
+
- test/helper.rb
|
163
|
+
- test/test_entities.rb
|
164
|
+
- ext/libstemmer_c/Makefile
|
165
|
+
- README.rdoc
|
166
|
+
has_rdoc: true
|
167
|
+
homepage: http://github.com/deepfryed/chipper
|
168
|
+
licenses: []
|
169
|
+
|
170
|
+
post_install_message:
|
171
|
+
rdoc_options: []
|
172
|
+
|
173
|
+
require_paths:
|
174
|
+
- lib
|
175
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
176
|
+
none: false
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
segments:
|
181
|
+
- 0
|
182
|
+
version: "0"
|
183
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
184
|
+
none: false
|
185
|
+
requirements:
|
186
|
+
- - ">="
|
187
|
+
- !ruby/object:Gem::Version
|
188
|
+
segments:
|
189
|
+
- 0
|
190
|
+
version: "0"
|
191
|
+
requirements: []
|
192
|
+
|
193
|
+
rubyforge_project:
|
194
|
+
rubygems_version: 1.3.7
|
195
|
+
signing_key:
|
196
|
+
specification_version: 3
|
197
|
+
summary: twitter text extractor
|
198
|
+
test_files: []
|
199
|
+
|