chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/test/test_entities.rb
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
require 'helper'
|
|
3
|
+
|
|
4
|
+
describe 'Chipper entities' do
|
|
5
|
+
before do
|
|
6
|
+
@tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://t.co/IsSh1t12"
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
after do
|
|
10
|
+
Chipper.skip_users nil
|
|
11
|
+
Chipper.skip_hashtags nil
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
it 'should extract users' do
|
|
15
|
+
Chipper.users(@tweet).must_equal %w(@youtube @apple)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'should extract hashtags' do
|
|
19
|
+
Chipper.hashtags(@tweet).must_equal %w(#cnn #removed)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'should extract urls' do
|
|
23
|
+
Chipper.urls(@tweet).must_equal %w(http://t.co/IsSh1t12)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# NB, these should be only [a-zA-Z0-9]+ in the path in fact, not /w as that will include unicode chars
|
|
27
|
+
it 'should extract t.co urls cleanly w/quote' do
|
|
28
|
+
text = "Hello http://t.co/97CLxVkD\" http://t.co/12345678, http://t.co/xxxxxxxx. http://t.co/xxxx1234' damn!"
|
|
29
|
+
urls = %w(http://t.co/97CLxVkD http://t.co/12345678 http://t.co/xxxxxxxx http://t.co/xxxx1234)
|
|
30
|
+
Chipper.urls(text).must_equal(urls)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it 'should skip users' do
|
|
34
|
+
Chipper.skip_users(%w(youtube))
|
|
35
|
+
Chipper.users(@tweet).must_equal %w(@apple)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
it 'should skip hashtags' do
|
|
39
|
+
Chipper.skip_hashtags(%w(cnn))
|
|
40
|
+
Chipper.hashtags(@tweet).must_equal %w(#removed)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it 'should return all entities using #entities method' do
|
|
44
|
+
expected = {}
|
|
45
|
+
expected.merge! users: %w(@youtube @apple)
|
|
46
|
+
expected.merge! hashtags: %w(#cnn #removed)
|
|
47
|
+
expected.merge! urls: %w(http://t.co/IsSh1t12)
|
|
48
|
+
expected.merge! tokens: [["why"], ["that", "stories", "abt"], ["always", "get"], ["from"]]
|
|
49
|
+
|
|
50
|
+
Chipper.entities(@tweet).must_equal expected
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
it 'should work around shitty urls' do
|
|
54
|
+
crap = "foo bar https://t.co/KCZSuVx½"
|
|
55
|
+
Chipper.urls(crap).must_equal ['https://t.co/KCZSuVx']
|
|
56
|
+
end
|
|
57
|
+
end
|
data/test/test_tokens.rb
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
require 'helper'
|
|
2
|
+
|
|
3
|
+
describe 'Chipper tokens' do
|
|
4
|
+
before do
|
|
5
|
+
@tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://www1.youtube.com/videos/?"
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
after do
|
|
9
|
+
Chipper.skip_tokens nil
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
it 'should extract tokens' do
|
|
13
|
+
expected = [%w(why), %w(that stories abt), %w(always get), %w(from)]
|
|
14
|
+
Chipper.tokens(@tweet).must_equal expected
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it 'should skip tokens' do
|
|
18
|
+
expected = [%w(why), %w(that), %w(abt), %w(get)]
|
|
19
|
+
Chipper.skip_tokens(%w(story always from video))
|
|
20
|
+
Chipper.tokens(@tweet).must_equal expected
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
it 'should skip numbers dates and times' do
|
|
24
|
+
expected = [["flubble","bubble"],["champion"]]
|
|
25
|
+
Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
|
|
26
|
+
Chipper.tokens("flubble bubble I or 123454 123-123-1111 19 on te champion 19th at 4am or 12pm").must_equal expected
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'should skip random other chars dates and times' do
|
|
30
|
+
expected = [["flubble","bubble"],["champion","winter"],["incredible"]]
|
|
31
|
+
Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
|
|
32
|
+
Chipper.tokens("flubble_bubble ^o^ champion_winter oh_____ _______ ^incredible ").must_equal expected
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
it 'should be lowercase' do
|
|
36
|
+
expected = [["flubble","bubble"]]
|
|
37
|
+
Chipper.tokens("FLUBBLE BUBBLE").must_equal expected
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
it 'stop words should not be case sensitive' do
|
|
41
|
+
expected = [["pancakes"]]
|
|
42
|
+
Chipper.skip_tokens(%w(christmas))
|
|
43
|
+
Chipper.tokens("pancakes in Christmas").must_equal expected
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
it 'should filter stemmed words that are too short' do
|
|
47
|
+
expected = [["flubble","bubble"]]
|
|
48
|
+
Chipper.tokens("I am going to doing, being flubble bubble its").must_equal expected
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
# eg don't, won't aren't etc. Quick 'cheats' way to do this is just sub out all single quotes with nothing.
|
|
52
|
+
it 'should remove abbreviations with single quotes' do
|
|
53
|
+
expected = [["flubble","bubble","dont"]]
|
|
54
|
+
Chipper.tokens("flubble bubble don't do it").must_equal expected
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
it 'should segment across urls' do
|
|
58
|
+
expected = [%w(hello world), %w(this), %w(might work)]
|
|
59
|
+
Chipper.tokens('hello world, this http://www.example.com/1 might work').must_equal expected
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
describe 'segment across short, stop and non-words' do
|
|
63
|
+
before do
|
|
64
|
+
@expected = [%w(flopper bopper), %w(dopper)]
|
|
65
|
+
Chipper.skip_tokens(%w(four five six stopper))
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
after do
|
|
69
|
+
Chipper.skip_tokens nil
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'should segment correctly on short word' do
|
|
73
|
+
Chipper.tokens('Flopper Bopper at Dopper').must_equal @expected
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
it 'should segment correctly on stop word' do
|
|
77
|
+
Chipper.tokens('Flopper Bopper STOPPER Dopper').must_equal @expected
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
it 'should segment correctly on non-word' do
|
|
81
|
+
Chipper.tokens('Flopper Bopper. Dopper').must_equal @expected
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
it 'should not get stuffs from users or hashtags' do
|
|
85
|
+
Chipper.tokens("melbourne @sydney_islame or #brisbane_humid").must_equal [["melbourne"]]
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
it 'should not add random newlines becasue there is underscores, full stop, space then url' do
|
|
89
|
+
Chipper.tokens("one__two.three http://t.co").must_equal [["one","two"],["three"]]
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it 'should not add random newlines becasue there is leading underscore colon space then url' do
|
|
93
|
+
Chipper.tokens('bushes _purple: http://t.co/').must_equal [["bushes","purple"]]
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
it 'should group and segment across repeated or anchored underscores' do
|
|
97
|
+
Chipper.tokens('_foo bar._baz: hello__world! http://t.co/').must_equal [%w(foo bar), %w(baz), %w(hello world)]
|
|
98
|
+
Chipper.tokens('_foo bar._baz:_hello #hi,world').must_equal [%w(foo bar), %w(baz), %w(hello), %w(world)]
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
describe 'unicode' do
|
|
103
|
+
it 'should skip quotes and handle fullwidth @' do
|
|
104
|
+
text = "hello world, ain\u2019t this \uff20cool huh\u201d"
|
|
105
|
+
Chipper.tokens(text).must_equal [["hello", "world"], ["aint", "this"], ["huh"]]
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
it 'should kill tokens that start with unicode' do
|
|
109
|
+
text = "hello world, \u2020\uff26\u201f \u2021\uff36\u210fcool 12TH 123345134 awesome\u2020\uff26\u201f"
|
|
110
|
+
Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$|^\W.*$}
|
|
111
|
+
Chipper.tokens(text).must_equal [["hello", "world"], ["awesome\u2020\uff26"]]
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
it 'should segment on unicode puncutation' do
|
|
115
|
+
Chipper.tokens("\u2039Hello\u203aWorld").must_equal [["hello"], ["world"]]
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: chipper
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 4
|
|
8
|
+
- 2
|
|
9
|
+
version: 0.4.2
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- Bharanee Rathna
|
|
13
|
+
- John Barratt
|
|
14
|
+
autorequire:
|
|
15
|
+
bindir: bin
|
|
16
|
+
cert_chain: []
|
|
17
|
+
|
|
18
|
+
date: 2012-01-16 00:00:00 +11:00
|
|
19
|
+
default_executable:
|
|
20
|
+
dependencies: []
|
|
21
|
+
|
|
22
|
+
description: twitter text extraction utilities
|
|
23
|
+
email:
|
|
24
|
+
- deepfryed@gmail.com
|
|
25
|
+
- djon00@gmail.com
|
|
26
|
+
executables: []
|
|
27
|
+
|
|
28
|
+
extensions:
|
|
29
|
+
- ext/extconf.rb
|
|
30
|
+
extra_rdoc_files:
|
|
31
|
+
- README.rdoc
|
|
32
|
+
files:
|
|
33
|
+
- ext/src/chipper.cc
|
|
34
|
+
- ext/re2/filtered_re2.cc
|
|
35
|
+
- ext/re2/unicode_casefold.cc
|
|
36
|
+
- ext/re2/prefilter.cc
|
|
37
|
+
- ext/re2/prefilter_tree.cc
|
|
38
|
+
- ext/re2/re2.cc
|
|
39
|
+
- ext/re2/valgrind.cc
|
|
40
|
+
- ext/re2/hash.cc
|
|
41
|
+
- ext/re2/parse.cc
|
|
42
|
+
- ext/re2/stringpiece.cc
|
|
43
|
+
- ext/re2/set.cc
|
|
44
|
+
- ext/re2/bitstate.cc
|
|
45
|
+
- ext/re2/prog.cc
|
|
46
|
+
- ext/re2/simplify.cc
|
|
47
|
+
- ext/re2/rune.cc
|
|
48
|
+
- ext/re2/dfa.cc
|
|
49
|
+
- ext/re2/onepass.cc
|
|
50
|
+
- ext/re2/unicode_groups.cc
|
|
51
|
+
- ext/re2/regexp.cc
|
|
52
|
+
- ext/re2/nfa.cc
|
|
53
|
+
- ext/re2/perl_groups.cc
|
|
54
|
+
- ext/re2/mimics_pcre.cc
|
|
55
|
+
- ext/re2/compile.cc
|
|
56
|
+
- ext/re2/tostring.cc
|
|
57
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_russian.c
|
|
58
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c
|
|
59
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c
|
|
60
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
|
|
61
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_danish.c
|
|
62
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
|
|
63
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
|
|
64
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_swedish.c
|
|
65
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c
|
|
66
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
|
|
67
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
|
|
68
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
|
|
69
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_german.c
|
|
70
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_romanian.c
|
|
71
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_turkish.c
|
|
72
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_italian.c
|
|
73
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
|
|
74
|
+
- ext/libstemmer_c/src_c/stem_KOI8_R_russian.c
|
|
75
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c
|
|
76
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c
|
|
77
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c
|
|
78
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_french.c
|
|
79
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_english.c
|
|
80
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_finnish.c
|
|
81
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
|
|
82
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c
|
|
83
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_porter.c
|
|
84
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_spanish.c
|
|
85
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
|
|
86
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_dutch.c
|
|
87
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c
|
|
88
|
+
- ext/libstemmer_c/libstemmer/libstemmer.c
|
|
89
|
+
- ext/libstemmer_c/libstemmer/libstemmer_utf8.c
|
|
90
|
+
- ext/libstemmer_c/runtime/api.c
|
|
91
|
+
- ext/libstemmer_c/runtime/utilities.c
|
|
92
|
+
- ext/libstemmer_c/examples/stemwords.c
|
|
93
|
+
- ext/libstemmer_c/mkinc_utf8.mak
|
|
94
|
+
- ext/libstemmer_c/mkinc.mak
|
|
95
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
|
|
96
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_porter.h
|
|
97
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_danish.h
|
|
98
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_english.h
|
|
99
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h
|
|
100
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
|
|
101
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_romanian.h
|
|
102
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h
|
|
103
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
|
|
104
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h
|
|
105
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h
|
|
106
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h
|
|
107
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
|
|
108
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h
|
|
109
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_finnish.h
|
|
110
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_italian.h
|
|
111
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_russian.h
|
|
112
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_spanish.h
|
|
113
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_swedish.h
|
|
114
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_french.h
|
|
115
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h
|
|
116
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_turkish.h
|
|
117
|
+
- ext/libstemmer_c/src_c/stem_KOI8_R_russian.h
|
|
118
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_german.h
|
|
119
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
|
|
120
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
|
|
121
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h
|
|
122
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h
|
|
123
|
+
- ext/libstemmer_c/src_c/stem_UTF_8_dutch.h
|
|
124
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
|
|
125
|
+
- ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
|
|
126
|
+
- ext/libstemmer_c/include/libstemmer.h
|
|
127
|
+
- ext/libstemmer_c/libstemmer/modules_utf8.h
|
|
128
|
+
- ext/libstemmer_c/libstemmer/modules.h
|
|
129
|
+
- ext/libstemmer_c/runtime/header.h
|
|
130
|
+
- ext/libstemmer_c/runtime/api.h
|
|
131
|
+
- ext/src/version.h
|
|
132
|
+
- ext/util/benchmark.h
|
|
133
|
+
- ext/util/arena.h
|
|
134
|
+
- ext/util/thread.h
|
|
135
|
+
- ext/util/mutex.h
|
|
136
|
+
- ext/util/atomicops.h
|
|
137
|
+
- ext/util/valgrind.h
|
|
138
|
+
- ext/util/test.h
|
|
139
|
+
- ext/util/random.h
|
|
140
|
+
- ext/util/sparse_set.h
|
|
141
|
+
- ext/util/utf.h
|
|
142
|
+
- ext/util/pcre.h
|
|
143
|
+
- ext/util/flags.h
|
|
144
|
+
- ext/util/util.h
|
|
145
|
+
- ext/util/sparse_array.h
|
|
146
|
+
- ext/util/logging.h
|
|
147
|
+
- ext/re2/stringpiece.h
|
|
148
|
+
- ext/re2/prefilter_tree.h
|
|
149
|
+
- ext/re2/prefilter.h
|
|
150
|
+
- ext/re2/walker-inl.h
|
|
151
|
+
- ext/re2/variadic_function.h
|
|
152
|
+
- ext/re2/set.h
|
|
153
|
+
- ext/re2/re2.h
|
|
154
|
+
- ext/re2/regexp.h
|
|
155
|
+
- ext/re2/filtered_re2.h
|
|
156
|
+
- ext/re2/unicode_casefold.h
|
|
157
|
+
- ext/re2/prog.h
|
|
158
|
+
- ext/re2/unicode_groups.h
|
|
159
|
+
- ext/extconf.rb
|
|
160
|
+
- ext/stemmer.rb
|
|
161
|
+
- test/test_tokens.rb
|
|
162
|
+
- test/helper.rb
|
|
163
|
+
- test/test_entities.rb
|
|
164
|
+
- ext/libstemmer_c/Makefile
|
|
165
|
+
- README.rdoc
|
|
166
|
+
has_rdoc: true
|
|
167
|
+
homepage: http://github.com/deepfryed/chipper
|
|
168
|
+
licenses: []
|
|
169
|
+
|
|
170
|
+
post_install_message:
|
|
171
|
+
rdoc_options: []
|
|
172
|
+
|
|
173
|
+
require_paths:
|
|
174
|
+
- lib
|
|
175
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
176
|
+
none: false
|
|
177
|
+
requirements:
|
|
178
|
+
- - ">="
|
|
179
|
+
- !ruby/object:Gem::Version
|
|
180
|
+
segments:
|
|
181
|
+
- 0
|
|
182
|
+
version: "0"
|
|
183
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
184
|
+
none: false
|
|
185
|
+
requirements:
|
|
186
|
+
- - ">="
|
|
187
|
+
- !ruby/object:Gem::Version
|
|
188
|
+
segments:
|
|
189
|
+
- 0
|
|
190
|
+
version: "0"
|
|
191
|
+
requirements: []
|
|
192
|
+
|
|
193
|
+
rubyforge_project:
|
|
194
|
+
rubygems_version: 1.3.7
|
|
195
|
+
signing_key:
|
|
196
|
+
specification_version: 3
|
|
197
|
+
summary: twitter text extractor
|
|
198
|
+
test_files: []
|
|
199
|
+
|