chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/test/helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ $:.unshift File.dirname(__FILE__) + '/../ext'
2
+ require 'chipper'
3
+ require 'minitest/spec'
4
+
5
+ MiniTest::Unit.autorun
data/test/test_entities.rb ADDED
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
+ describe 'Chipper entities' do
5
+ before do
6
+ @tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://t.co/IsSh1t12"
7
+ end
8
+
9
+ after do
10
+ Chipper.skip_users nil
11
+ Chipper.skip_hashtags nil
12
+ end
13
+
14
+ it 'should extract users' do
15
+ Chipper.users(@tweet).must_equal %w(@youtube @apple)
16
+ end
17
+
18
+ it 'should extract hashtags' do
19
+ Chipper.hashtags(@tweet).must_equal %w(#cnn #removed)
20
+ end
21
+
22
+ it 'should extract urls' do
23
+ Chipper.urls(@tweet).must_equal %w(http://t.co/IsSh1t12)
24
+ end
25
+
26
+ # NB: these should in fact allow only [a-zA-Z0-9]+ in the path, not \w, as that will include unicode chars
27
+ it 'should extract t.co urls cleanly w/quote' do
28
+ text = "Hello http://t.co/97CLxVkD\" http://t.co/12345678, http://t.co/xxxxxxxx. http://t.co/xxxx1234' damn!"
29
+ urls = %w(http://t.co/97CLxVkD http://t.co/12345678 http://t.co/xxxxxxxx http://t.co/xxxx1234)
30
+ Chipper.urls(text).must_equal(urls)
31
+ end
32
+
33
+ it 'should skip users' do
34
+ Chipper.skip_users(%w(youtube))
35
+ Chipper.users(@tweet).must_equal %w(@apple)
36
+ end
37
+
38
+ it 'should skip hashtags' do
39
+ Chipper.skip_hashtags(%w(cnn))
40
+ Chipper.hashtags(@tweet).must_equal %w(#removed)
41
+ end
42
+
43
+ it 'should return all entities using #entities method' do
44
+ expected = {}
45
+ expected.merge! users: %w(@youtube @apple)
46
+ expected.merge! hashtags: %w(#cnn #removed)
47
+ expected.merge! urls: %w(http://t.co/IsSh1t12)
48
+ expected.merge! tokens: [["why"], ["that", "stories", "abt"], ["always", "get"], ["from"]]
49
+
50
+ Chipper.entities(@tweet).must_equal expected
51
+ end
52
+
53
+ it 'should work around shitty urls' do
54
+ crap = "foo bar https://t.co/KCZSuVx½"
55
+ Chipper.urls(crap).must_equal ['https://t.co/KCZSuVx']
56
+ end
57
+ end
data/test/test_tokens.rb ADDED
@@ -0,0 +1,118 @@
1
+ require 'helper'
2
+
3
+ describe 'Chipper tokens' do
4
+ before do
5
+ @tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://www1.youtube.com/videos/?"
6
+ end
7
+
8
+ after do
9
+ Chipper.skip_tokens nil
10
+ end
11
+
12
+ it 'should extract tokens' do
13
+ expected = [%w(why), %w(that stories abt), %w(always get), %w(from)]
14
+ Chipper.tokens(@tweet).must_equal expected
15
+ end
16
+
17
+ it 'should skip tokens' do
18
+ expected = [%w(why), %w(that), %w(abt), %w(get)]
19
+ Chipper.skip_tokens(%w(story always from video))
20
+ Chipper.tokens(@tweet).must_equal expected
21
+ end
22
+
23
+ it 'should skip numbers dates and times' do
24
+ expected = [["flubble","bubble"],["champion"]]
25
+ Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
26
+ Chipper.tokens("flubble bubble I or 123454 123-123-1111 19 on te champion 19th at 4am or 12pm").must_equal expected
27
+ end
28
+
29
+ it 'should skip random other chars dates and times' do
30
+ expected = [["flubble","bubble"],["champion","winter"],["incredible"]]
31
+ Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
32
+ Chipper.tokens("flubble_bubble ^o^ champion_winter oh_____ _______ ^incredible ").must_equal expected
33
+ end
34
+
35
+ it 'should be lowercase' do
36
+ expected = [["flubble","bubble"]]
37
+ Chipper.tokens("FLUBBLE BUBBLE").must_equal expected
38
+ end
39
+
40
+ it 'stop words should not be case sensitive' do
41
+ expected = [["pancakes"]]
42
+ Chipper.skip_tokens(%w(christmas))
43
+ Chipper.tokens("pancakes in Christmas").must_equal expected
44
+ end
45
+
46
+ it 'should filter stemmed words that are too short' do
47
+ expected = [["flubble","bubble"]]
48
+ Chipper.tokens("I am going to doing, being flubble bubble its").must_equal expected
49
+ end
50
+
51
+ # e.g. don't, won't, aren't, etc. The quick 'cheat' way to do this is to just sub out all single quotes with nothing.
52
+ it 'should remove abbreviations with single quotes' do
53
+ expected = [["flubble","bubble","dont"]]
54
+ Chipper.tokens("flubble bubble don't do it").must_equal expected
55
+ end
56
+
57
+ it 'should segment across urls' do
58
+ expected = [%w(hello world), %w(this), %w(might work)]
59
+ Chipper.tokens('hello world, this http://www.example.com/1 might work').must_equal expected
60
+ end
61
+
62
+ describe 'segment across short, stop and non-words' do
63
+ before do
64
+ @expected = [%w(flopper bopper), %w(dopper)]
65
+ Chipper.skip_tokens(%w(four five six stopper))
66
+ end
67
+
68
+ after do
69
+ Chipper.skip_tokens nil
70
+ end
71
+
72
+ it 'should segment correctly on short word' do
73
+ Chipper.tokens('Flopper Bopper at Dopper').must_equal @expected
74
+ end
75
+
76
+ it 'should segment correctly on stop word' do
77
+ Chipper.tokens('Flopper Bopper STOPPER Dopper').must_equal @expected
78
+ end
79
+
80
+ it 'should segment correctly on non-word' do
81
+ Chipper.tokens('Flopper Bopper. Dopper').must_equal @expected
82
+ end
83
+
84
+ it 'should not get stuffs from users or hashtags' do
85
+ Chipper.tokens("melbourne @sydney_islame or #brisbane_humid").must_equal [["melbourne"]]
86
+ end
87
+
88
+ it 'should not add random newlines becasue there is underscores, full stop, space then url' do
89
+ Chipper.tokens("one__two.three http://t.co").must_equal [["one","two"],["three"]]
90
+ end
91
+
92
+ it 'should not add random newlines becasue there is leading underscore colon space then url' do
93
+ Chipper.tokens('bushes _purple: http://t.co/').must_equal [["bushes","purple"]]
94
+ end
95
+
96
+ it 'should group and segment across repeated or anchored underscores' do
97
+ Chipper.tokens('_foo bar._baz: hello__world! http://t.co/').must_equal [%w(foo bar), %w(baz), %w(hello world)]
98
+ Chipper.tokens('_foo bar._baz:_hello #hi,world').must_equal [%w(foo bar), %w(baz), %w(hello), %w(world)]
99
+ end
100
+ end
101
+
102
+ describe 'unicode' do
103
+ it 'should skip quotes and handle fullwidth @' do
104
+ text = "hello world, ain\u2019t this \uff20cool huh\u201d"
105
+ Chipper.tokens(text).must_equal [["hello", "world"], ["aint", "this"], ["huh"]]
106
+ end
107
+
108
+ it 'should kill tokens that start with unicode' do
109
+ text = "hello world, \u2020\uff26\u201f \u2021\uff36\u210fcool 12TH 123345134 awesome\u2020\uff26\u201f"
110
+ Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$|^\W.*$}
111
+ Chipper.tokens(text).must_equal [["hello", "world"], ["awesome\u2020\uff26"]]
112
+ end
113
+
114
+ it 'should segment on unicode puncutation' do
115
+ Chipper.tokens("\u2039Hello\u203aWorld").must_equal [["hello"], ["world"]]
116
+ end
117
+ end
118
+ end
metadata ADDED
@@ -0,0 +1,199 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chipper
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 2
9
+ version: 0.4.2
10
+ platform: ruby
11
+ authors:
12
+ - Bharanee Rathna
13
+ - John Barratt
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-16 00:00:00 +11:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: twitter text extraction utilities
23
+ email:
24
+ - deepfryed@gmail.com
25
+ - djon00@gmail.com
26
+ executables: []
27
+
28
+ extensions:
29
+ - ext/extconf.rb
30
+ extra_rdoc_files:
31
+ - README.rdoc
32
+ files:
33
+ - ext/src/chipper.cc
34
+ - ext/re2/filtered_re2.cc
35
+ - ext/re2/unicode_casefold.cc
36
+ - ext/re2/prefilter.cc
37
+ - ext/re2/prefilter_tree.cc
38
+ - ext/re2/re2.cc
39
+ - ext/re2/valgrind.cc
40
+ - ext/re2/hash.cc
41
+ - ext/re2/parse.cc
42
+ - ext/re2/stringpiece.cc
43
+ - ext/re2/set.cc
44
+ - ext/re2/bitstate.cc
45
+ - ext/re2/prog.cc
46
+ - ext/re2/simplify.cc
47
+ - ext/re2/rune.cc
48
+ - ext/re2/dfa.cc
49
+ - ext/re2/onepass.cc
50
+ - ext/re2/unicode_groups.cc
51
+ - ext/re2/regexp.cc
52
+ - ext/re2/nfa.cc
53
+ - ext/re2/perl_groups.cc
54
+ - ext/re2/mimics_pcre.cc
55
+ - ext/re2/compile.cc
56
+ - ext/re2/tostring.cc
57
+ - ext/libstemmer_c/src_c/stem_UTF_8_russian.c
58
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c
59
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c
60
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
61
+ - ext/libstemmer_c/src_c/stem_UTF_8_danish.c
62
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
63
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
64
+ - ext/libstemmer_c/src_c/stem_UTF_8_swedish.c
65
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c
66
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
67
+ - ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
68
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
69
+ - ext/libstemmer_c/src_c/stem_UTF_8_german.c
70
+ - ext/libstemmer_c/src_c/stem_UTF_8_romanian.c
71
+ - ext/libstemmer_c/src_c/stem_UTF_8_turkish.c
72
+ - ext/libstemmer_c/src_c/stem_UTF_8_italian.c
73
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
74
+ - ext/libstemmer_c/src_c/stem_KOI8_R_russian.c
75
+ - ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c
76
+ - ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c
77
+ - ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c
78
+ - ext/libstemmer_c/src_c/stem_UTF_8_french.c
79
+ - ext/libstemmer_c/src_c/stem_UTF_8_english.c
80
+ - ext/libstemmer_c/src_c/stem_UTF_8_finnish.c
81
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
82
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c
83
+ - ext/libstemmer_c/src_c/stem_UTF_8_porter.c
84
+ - ext/libstemmer_c/src_c/stem_UTF_8_spanish.c
85
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
86
+ - ext/libstemmer_c/src_c/stem_UTF_8_dutch.c
87
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c
88
+ - ext/libstemmer_c/libstemmer/libstemmer.c
89
+ - ext/libstemmer_c/libstemmer/libstemmer_utf8.c
90
+ - ext/libstemmer_c/runtime/api.c
91
+ - ext/libstemmer_c/runtime/utilities.c
92
+ - ext/libstemmer_c/examples/stemwords.c
93
+ - ext/libstemmer_c/mkinc_utf8.mak
94
+ - ext/libstemmer_c/mkinc.mak
95
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
96
+ - ext/libstemmer_c/src_c/stem_UTF_8_porter.h
97
+ - ext/libstemmer_c/src_c/stem_UTF_8_danish.h
98
+ - ext/libstemmer_c/src_c/stem_UTF_8_english.h
99
+ - ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h
100
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
101
+ - ext/libstemmer_c/src_c/stem_UTF_8_romanian.h
102
+ - ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h
103
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
104
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h
105
+ - ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h
106
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h
107
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
108
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h
109
+ - ext/libstemmer_c/src_c/stem_UTF_8_finnish.h
110
+ - ext/libstemmer_c/src_c/stem_UTF_8_italian.h
111
+ - ext/libstemmer_c/src_c/stem_UTF_8_russian.h
112
+ - ext/libstemmer_c/src_c/stem_UTF_8_spanish.h
113
+ - ext/libstemmer_c/src_c/stem_UTF_8_swedish.h
114
+ - ext/libstemmer_c/src_c/stem_UTF_8_french.h
115
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h
116
+ - ext/libstemmer_c/src_c/stem_UTF_8_turkish.h
117
+ - ext/libstemmer_c/src_c/stem_KOI8_R_russian.h
118
+ - ext/libstemmer_c/src_c/stem_UTF_8_german.h
119
+ - ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
120
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
121
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h
122
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h
123
+ - ext/libstemmer_c/src_c/stem_UTF_8_dutch.h
124
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
125
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
126
+ - ext/libstemmer_c/include/libstemmer.h
127
+ - ext/libstemmer_c/libstemmer/modules_utf8.h
128
+ - ext/libstemmer_c/libstemmer/modules.h
129
+ - ext/libstemmer_c/runtime/header.h
130
+ - ext/libstemmer_c/runtime/api.h
131
+ - ext/src/version.h
132
+ - ext/util/benchmark.h
133
+ - ext/util/arena.h
134
+ - ext/util/thread.h
135
+ - ext/util/mutex.h
136
+ - ext/util/atomicops.h
137
+ - ext/util/valgrind.h
138
+ - ext/util/test.h
139
+ - ext/util/random.h
140
+ - ext/util/sparse_set.h
141
+ - ext/util/utf.h
142
+ - ext/util/pcre.h
143
+ - ext/util/flags.h
144
+ - ext/util/util.h
145
+ - ext/util/sparse_array.h
146
+ - ext/util/logging.h
147
+ - ext/re2/stringpiece.h
148
+ - ext/re2/prefilter_tree.h
149
+ - ext/re2/prefilter.h
150
+ - ext/re2/walker-inl.h
151
+ - ext/re2/variadic_function.h
152
+ - ext/re2/set.h
153
+ - ext/re2/re2.h
154
+ - ext/re2/regexp.h
155
+ - ext/re2/filtered_re2.h
156
+ - ext/re2/unicode_casefold.h
157
+ - ext/re2/prog.h
158
+ - ext/re2/unicode_groups.h
159
+ - ext/extconf.rb
160
+ - ext/stemmer.rb
161
+ - test/test_tokens.rb
162
+ - test/helper.rb
163
+ - test/test_entities.rb
164
+ - ext/libstemmer_c/Makefile
165
+ - README.rdoc
166
+ has_rdoc: true
167
+ homepage: http://github.com/deepfryed/chipper
168
+ licenses: []
169
+
170
+ post_install_message:
171
+ rdoc_options: []
172
+
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ none: false
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ segments:
181
+ - 0
182
+ version: "0"
183
+ required_rubygems_version: !ruby/object:Gem::Requirement
184
+ none: false
185
+ requirements:
186
+ - - ">="
187
+ - !ruby/object:Gem::Version
188
+ segments:
189
+ - 0
190
+ version: "0"
191
+ requirements: []
192
+
193
+ rubyforge_project:
194
+ rubygems_version: 1.3.7
195
+ signing_key:
196
+ specification_version: 3
197
+ summary: twitter text extractor
198
+ test_files: []
199
+