chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/test/helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ $:.unshift File.dirname(__FILE__) + '/../ext'
2
+ require 'chipper'
3
+ require 'minitest/spec'
4
+
5
+ MiniTest::Unit.autorun
data/test/test_entities.rb ADDED
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
+ describe 'Chipper entities' do
5
+ before do
6
+ @tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://t.co/IsSh1t12"
7
+ end
8
+
9
+ after do
10
+ Chipper.skip_users nil
11
+ Chipper.skip_hashtags nil
12
+ end
13
+
14
+ it 'should extract users' do
15
+ Chipper.users(@tweet).must_equal %w(@youtube @apple)
16
+ end
17
+
18
+ it 'should extract hashtags' do
19
+ Chipper.hashtags(@tweet).must_equal %w(#cnn #removed)
20
+ end
21
+
22
+ it 'should extract urls' do
23
+ Chipper.urls(@tweet).must_equal %w(http://t.co/IsSh1t12)
24
+ end
25
+
26
+ # NB: in fact the path should match only [a-zA-Z0-9]+, not \w, as that will include unicode chars
27
+ it 'should extract t.co urls cleanly w/quote' do
28
+ text = "Hello http://t.co/97CLxVkD\" http://t.co/12345678, http://t.co/xxxxxxxx. http://t.co/xxxx1234' damn!"
29
+ urls = %w(http://t.co/97CLxVkD http://t.co/12345678 http://t.co/xxxxxxxx http://t.co/xxxx1234)
30
+ Chipper.urls(text).must_equal(urls)
31
+ end
32
+
33
+ it 'should skip users' do
34
+ Chipper.skip_users(%w(youtube))
35
+ Chipper.users(@tweet).must_equal %w(@apple)
36
+ end
37
+
38
+ it 'should skip hashtags' do
39
+ Chipper.skip_hashtags(%w(cnn))
40
+ Chipper.hashtags(@tweet).must_equal %w(#removed)
41
+ end
42
+
43
+ it 'should return all entities using #entities method' do
44
+ expected = {}
45
+ expected.merge! users: %w(@youtube @apple)
46
+ expected.merge! hashtags: %w(#cnn #removed)
47
+ expected.merge! urls: %w(http://t.co/IsSh1t12)
48
+ expected.merge! tokens: [["why"], ["that", "stories", "abt"], ["always", "get"], ["from"]]
49
+
50
+ Chipper.entities(@tweet).must_equal expected
51
+ end
52
+
53
+ it 'should work around shitty urls' do
54
+ crap = "foo bar https://t.co/KCZSuVx½"
55
+ Chipper.urls(crap).must_equal ['https://t.co/KCZSuVx']
56
+ end
57
+ end
data/test/test_tokens.rb ADDED
@@ -0,0 +1,118 @@
1
+ require 'helper'
2
+
3
+ describe 'Chipper tokens' do
4
+ before do
5
+ @tweet = "@youtube, why is that stories abt @apple on #cnn always get #removed from http://www1.youtube.com/videos/?"
6
+ end
7
+
8
+ after do
9
+ Chipper.skip_tokens nil
10
+ end
11
+
12
+ it 'should extract tokens' do
13
+ expected = [%w(why), %w(that stories abt), %w(always get), %w(from)]
14
+ Chipper.tokens(@tweet).must_equal expected
15
+ end
16
+
17
+ it 'should skip tokens' do
18
+ expected = [%w(why), %w(that), %w(abt), %w(get)]
19
+ Chipper.skip_tokens(%w(story always from video))
20
+ Chipper.tokens(@tweet).must_equal expected
21
+ end
22
+
23
+ it 'should skip numbers dates and times' do
24
+ expected = [["flubble","bubble"],["champion"]]
25
+ Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
26
+ Chipper.tokens("flubble bubble I or 123454 123-123-1111 19 on te champion 19th at 4am or 12pm").must_equal expected
27
+ end
28
+
29
+ it 'should skip random other chars dates and times' do
30
+ expected = [["flubble","bubble"],["champion","winter"],["incredible"]]
31
+ Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$}
32
+ Chipper.tokens("flubble_bubble ^o^ champion_winter oh_____ _______ ^incredible ").must_equal expected
33
+ end
34
+
35
+ it 'should be lowercase' do
36
+ expected = [["flubble","bubble"]]
37
+ Chipper.tokens("FLUBBLE BUBBLE").must_equal expected
38
+ end
39
+
40
+ it 'stop words should not be case sensitive' do
41
+ expected = [["pancakes"]]
42
+ Chipper.skip_tokens(%w(christmas))
43
+ Chipper.tokens("pancakes in Christmas").must_equal expected
44
+ end
45
+
46
+ it 'should filter stemmed words that are too short' do
47
+ expected = [["flubble","bubble"]]
48
+ Chipper.tokens("I am going to doing, being flubble bubble its").must_equal expected
49
+ end
50
+
51
+ # e.g. don't, won't, aren't, etc. A quick 'cheat' way to do this is to just sub out all single quotes with nothing.
52
+ it 'should remove abbreviations with single quotes' do
53
+ expected = [["flubble","bubble","dont"]]
54
+ Chipper.tokens("flubble bubble don't do it").must_equal expected
55
+ end
56
+
57
+ it 'should segment across urls' do
58
+ expected = [%w(hello world), %w(this), %w(might work)]
59
+ Chipper.tokens('hello world, this http://www.example.com/1 might work').must_equal expected
60
+ end
61
+
62
+ describe 'segment across short, stop and non-words' do
63
+ before do
64
+ @expected = [%w(flopper bopper), %w(dopper)]
65
+ Chipper.skip_tokens(%w(four five six stopper))
66
+ end
67
+
68
+ after do
69
+ Chipper.skip_tokens nil
70
+ end
71
+
72
+ it 'should segment correctly on short word' do
73
+ Chipper.tokens('Flopper Bopper at Dopper').must_equal @expected
74
+ end
75
+
76
+ it 'should segment correctly on stop word' do
77
+ Chipper.tokens('Flopper Bopper STOPPER Dopper').must_equal @expected
78
+ end
79
+
80
+ it 'should segment correctly on non-word' do
81
+ Chipper.tokens('Flopper Bopper. Dopper').must_equal @expected
82
+ end
83
+
84
+ it 'should not get stuffs from users or hashtags' do
85
+ Chipper.tokens("melbourne @sydney_islame or #brisbane_humid").must_equal [["melbourne"]]
86
+ end
87
+
88
+ it 'should not add random newlines becasue there is underscores, full stop, space then url' do
89
+ Chipper.tokens("one__two.three http://t.co").must_equal [["one","two"],["three"]]
90
+ end
91
+
92
+ it 'should not add random newlines becasue there is leading underscore colon space then url' do
93
+ Chipper.tokens('bushes _purple: http://t.co/').must_equal [["bushes","purple"]]
94
+ end
95
+
96
+ it 'should group and segment across repeated or anchored underscores' do
97
+ Chipper.tokens('_foo bar._baz: hello__world! http://t.co/').must_equal [%w(foo bar), %w(baz), %w(hello world)]
98
+ Chipper.tokens('_foo bar._baz:_hello #hi,world').must_equal [%w(foo bar), %w(baz), %w(hello), %w(world)]
99
+ end
100
+ end
101
+
102
+ describe 'unicode' do
103
+ it 'should skip quotes and handle fullwidth @' do
104
+ text = "hello world, ain\u2019t this \uff20cool huh\u201d"
105
+ Chipper.tokens(text).must_equal [["hello", "world"], ["aint", "this"], ["huh"]]
106
+ end
107
+
108
+ it 'should kill tokens that start with unicode' do
109
+ text = "hello world, \u2020\uff26\u201f \u2021\uff36\u210fcool 12TH 123345134 awesome\u2020\uff26\u201f"
110
+ Chipper.skip_token_pattern %q{^(?:[\d\-]+)(?:am|pm|th|st|rd)?$|^\W.*$}
111
+ Chipper.tokens(text).must_equal [["hello", "world"], ["awesome\u2020\uff26"]]
112
+ end
113
+
114
+ it 'should segment on unicode puncutation' do
115
+ Chipper.tokens("\u2039Hello\u203aWorld").must_equal [["hello"], ["world"]]
116
+ end
117
+ end
118
+ end
metadata ADDED
@@ -0,0 +1,199 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chipper
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 2
9
+ version: 0.4.2
10
+ platform: ruby
11
+ authors:
12
+ - Bharanee Rathna
13
+ - John Barratt
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-16 00:00:00 +11:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: twitter text extraction utilities
23
+ email:
24
+ - deepfryed@gmail.com
25
+ - djon00@gmail.com
26
+ executables: []
27
+
28
+ extensions:
29
+ - ext/extconf.rb
30
+ extra_rdoc_files:
31
+ - README.rdoc
32
+ files:
33
+ - ext/src/chipper.cc
34
+ - ext/re2/filtered_re2.cc
35
+ - ext/re2/unicode_casefold.cc
36
+ - ext/re2/prefilter.cc
37
+ - ext/re2/prefilter_tree.cc
38
+ - ext/re2/re2.cc
39
+ - ext/re2/valgrind.cc
40
+ - ext/re2/hash.cc
41
+ - ext/re2/parse.cc
42
+ - ext/re2/stringpiece.cc
43
+ - ext/re2/set.cc
44
+ - ext/re2/bitstate.cc
45
+ - ext/re2/prog.cc
46
+ - ext/re2/simplify.cc
47
+ - ext/re2/rune.cc
48
+ - ext/re2/dfa.cc
49
+ - ext/re2/onepass.cc
50
+ - ext/re2/unicode_groups.cc
51
+ - ext/re2/regexp.cc
52
+ - ext/re2/nfa.cc
53
+ - ext/re2/perl_groups.cc
54
+ - ext/re2/mimics_pcre.cc
55
+ - ext/re2/compile.cc
56
+ - ext/re2/tostring.cc
57
+ - ext/libstemmer_c/src_c/stem_UTF_8_russian.c
58
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c
59
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c
60
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
61
+ - ext/libstemmer_c/src_c/stem_UTF_8_danish.c
62
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
63
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
64
+ - ext/libstemmer_c/src_c/stem_UTF_8_swedish.c
65
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c
66
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
67
+ - ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
68
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
69
+ - ext/libstemmer_c/src_c/stem_UTF_8_german.c
70
+ - ext/libstemmer_c/src_c/stem_UTF_8_romanian.c
71
+ - ext/libstemmer_c/src_c/stem_UTF_8_turkish.c
72
+ - ext/libstemmer_c/src_c/stem_UTF_8_italian.c
73
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
74
+ - ext/libstemmer_c/src_c/stem_KOI8_R_russian.c
75
+ - ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c
76
+ - ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c
77
+ - ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c
78
+ - ext/libstemmer_c/src_c/stem_UTF_8_french.c
79
+ - ext/libstemmer_c/src_c/stem_UTF_8_english.c
80
+ - ext/libstemmer_c/src_c/stem_UTF_8_finnish.c
81
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
82
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c
83
+ - ext/libstemmer_c/src_c/stem_UTF_8_porter.c
84
+ - ext/libstemmer_c/src_c/stem_UTF_8_spanish.c
85
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
86
+ - ext/libstemmer_c/src_c/stem_UTF_8_dutch.c
87
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c
88
+ - ext/libstemmer_c/libstemmer/libstemmer.c
89
+ - ext/libstemmer_c/libstemmer/libstemmer_utf8.c
90
+ - ext/libstemmer_c/runtime/api.c
91
+ - ext/libstemmer_c/runtime/utilities.c
92
+ - ext/libstemmer_c/examples/stemwords.c
93
+ - ext/libstemmer_c/mkinc_utf8.mak
94
+ - ext/libstemmer_c/mkinc.mak
95
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
96
+ - ext/libstemmer_c/src_c/stem_UTF_8_porter.h
97
+ - ext/libstemmer_c/src_c/stem_UTF_8_danish.h
98
+ - ext/libstemmer_c/src_c/stem_UTF_8_english.h
99
+ - ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h
100
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
101
+ - ext/libstemmer_c/src_c/stem_UTF_8_romanian.h
102
+ - ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h
103
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
104
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h
105
+ - ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h
106
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h
107
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
108
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h
109
+ - ext/libstemmer_c/src_c/stem_UTF_8_finnish.h
110
+ - ext/libstemmer_c/src_c/stem_UTF_8_italian.h
111
+ - ext/libstemmer_c/src_c/stem_UTF_8_russian.h
112
+ - ext/libstemmer_c/src_c/stem_UTF_8_spanish.h
113
+ - ext/libstemmer_c/src_c/stem_UTF_8_swedish.h
114
+ - ext/libstemmer_c/src_c/stem_UTF_8_french.h
115
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h
116
+ - ext/libstemmer_c/src_c/stem_UTF_8_turkish.h
117
+ - ext/libstemmer_c/src_c/stem_KOI8_R_russian.h
118
+ - ext/libstemmer_c/src_c/stem_UTF_8_german.h
119
+ - ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
120
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
121
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h
122
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h
123
+ - ext/libstemmer_c/src_c/stem_UTF_8_dutch.h
124
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
125
+ - ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
126
+ - ext/libstemmer_c/include/libstemmer.h
127
+ - ext/libstemmer_c/libstemmer/modules_utf8.h
128
+ - ext/libstemmer_c/libstemmer/modules.h
129
+ - ext/libstemmer_c/runtime/header.h
130
+ - ext/libstemmer_c/runtime/api.h
131
+ - ext/src/version.h
132
+ - ext/util/benchmark.h
133
+ - ext/util/arena.h
134
+ - ext/util/thread.h
135
+ - ext/util/mutex.h
136
+ - ext/util/atomicops.h
137
+ - ext/util/valgrind.h
138
+ - ext/util/test.h
139
+ - ext/util/random.h
140
+ - ext/util/sparse_set.h
141
+ - ext/util/utf.h
142
+ - ext/util/pcre.h
143
+ - ext/util/flags.h
144
+ - ext/util/util.h
145
+ - ext/util/sparse_array.h
146
+ - ext/util/logging.h
147
+ - ext/re2/stringpiece.h
148
+ - ext/re2/prefilter_tree.h
149
+ - ext/re2/prefilter.h
150
+ - ext/re2/walker-inl.h
151
+ - ext/re2/variadic_function.h
152
+ - ext/re2/set.h
153
+ - ext/re2/re2.h
154
+ - ext/re2/regexp.h
155
+ - ext/re2/filtered_re2.h
156
+ - ext/re2/unicode_casefold.h
157
+ - ext/re2/prog.h
158
+ - ext/re2/unicode_groups.h
159
+ - ext/extconf.rb
160
+ - ext/stemmer.rb
161
+ - test/test_tokens.rb
162
+ - test/helper.rb
163
+ - test/test_entities.rb
164
+ - ext/libstemmer_c/Makefile
165
+ - README.rdoc
166
+ has_rdoc: true
167
+ homepage: http://github.com/deepfryed/chipper
168
+ licenses: []
169
+
170
+ post_install_message:
171
+ rdoc_options: []
172
+
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ none: false
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ segments:
181
+ - 0
182
+ version: "0"
183
+ required_rubygems_version: !ruby/object:Gem::Requirement
184
+ none: false
185
+ requirements:
186
+ - - ">="
187
+ - !ruby/object:Gem::Version
188
+ segments:
189
+ - 0
190
+ version: "0"
191
+ requirements: []
192
+
193
+ rubyforge_project:
194
+ rubygems_version: 1.3.7
195
+ signing_key:
196
+ specification_version: 3
197
+ summary: twitter text extractor
198
+ test_files: []
199
+