summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62)
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,172 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="estonian">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>aga</word>
62
+ <word>ainult</word>
63
+ <word>alates</word>
64
+ <word>alati</word>
65
+ <word>all</word>
66
+ <word>ees</word>
67
+ <word>ei</word>
68
+ <word>esimene</word>
69
+ <word>et</word>
70
+ <word>hoolimata</word>
71
+ <word>iga</word>
72
+ <word>ilma</word>
73
+ <word>ja</word>
74
+ <word>jah</word>
75
+ <word>jaoks</word>
76
+ <word>jooksul</word>
77
+ <word>juures</word>
78
+ <word>ka</word>
79
+ <word>kaheksa</word>
80
+ <word>kaheksakümmend</word>
81
+ <word>kaks</word>
82
+ <word>kakskümmend</word>
83
+ <word>kelle</word>
84
+ <word>kes</word>
85
+ <word>kolm</word>
86
+ <word>kolmkümmend</word>
87
+ <word>koos</word>
88
+ <word>kui</word>
89
+ <word>kümme</word>
90
+ <word>kuni</word>
91
+ <word>kus</word>
92
+ <word>kuus</word>
93
+ <word>kuuskümmend</word>
94
+ <word>läbi</word>
95
+ <word>lähed</word>
96
+ <word>lähema</word>
97
+ <word>lähen</word>
98
+ <word>läks</word>
99
+ <word>läksid</word>
100
+ <word>läksime</word>
101
+ <word>läksite</word>
102
+ <word>ma</word>
103
+ <word>me</word>
104
+ <word>meie</word>
105
+ <word>miks</word>
106
+ <word>miljon</word>
107
+ <word>mina</word>
108
+ <word>mind</word>
109
+ <word>minema</word>
110
+ <word>mis</word>
111
+ <word>missugune</word>
112
+ <word>mõnikord</word>
113
+ <word>mulle</word>
114
+ <word>neli</word>
115
+ <word>nelikümmend</word>
116
+ <word>nende</word>
117
+ <word>ning</word>
118
+ <word>null</word>
119
+ <word>nüüd</word>
120
+ <word>oled</word>
121
+ <word>olema</word>
122
+ <word>oleme</word>
123
+ <word>olen</word>
124
+ <word>oli</word>
125
+ <word>olnud</word>
126
+ <word>omama</word>
127
+ <word>on</word>
128
+ <word>palju</word>
129
+ <word>peal</word>
130
+ <word>piki</word>
131
+ <word>sa</word>
132
+ <word>sada</word>
133
+ <word>seal</word>
134
+ <word>see</word>
135
+ <word>sees</word>
136
+ <word>seest</word>
137
+ <word>seitse</word>
138
+ <word>seitsekümmend</word>
139
+ <word>sest</word>
140
+ <word>siin</word>
141
+ <word>siis</word>
142
+ <word>sina</word>
143
+ <word>sind</word>
144
+ <word>sulle</word>
145
+ <word>ta</word>
146
+ <word>tagasi</word>
147
+ <word>talle</word>
148
+ <word>te</word>
149
+ <word>teeb</word>
150
+ <word>teed</word>
151
+ <word>teen</word>
152
+ <word>tegema</word>
153
+ <word>tegi</word>
154
+ <word>tehtud</word>
155
+ <word>teie</word>
156
+ <word>tema</word>
157
+ <word>tuhat</word>
158
+ <word>üheksa</word>
159
+ <word>üheksakümmend</word>
160
+ <word>ükskord</word>
161
+ <word>üle</word>
162
+ <word>ümber</word>
163
+ <word>umbes</word>
164
+ <word>vahel</word>
165
+ <word>vastas</word>
166
+ <word>vastu</word>
167
+ <word>veel</word>
168
+ <word>viis</word>
169
+ <word>viiskümmend</word>
170
+ <word>või</word>
171
+ </grader-tc>
172
+ </dictionary>
@@ -0,0 +1,77 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="basque">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>bai</word>
62
+ <word>baita</word>
63
+ <word>bere</word>
64
+ <word>edo</word>
65
+ <word>egon</word>
66
+ <word>ere</word>
67
+ <word>eta</word>
68
+ <word>ez</word>
69
+ <word>gabe</word>
70
+ <word>hau</word>
71
+ <word>hori</word>
72
+ <word>hura</word>
73
+ <word>inor</word>
74
+ <word>izan</word>
75
+ <word>kaixo</word>
76
+ </grader-tc>
77
+ </dictionary>
@@ -0,0 +1,105 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="finnish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>ehkä</word>
62
+ <word>enemmän</word>
63
+ <word>että</word>
64
+ <word>he</word>
65
+ <word>hei</word>
66
+ <word>hän</word>
67
+ <word>ja</word>
68
+ <word>jahka</word>
69
+ <word>joo</word>
70
+ <word>joskus</word>
71
+ <word>jotta</word>
72
+ <word>kaikki</word>
73
+ <word>kuinka</word>
74
+ <word>kun</word>
75
+ <word>me</word>
76
+ <word>mikä</word>
77
+ <word>minä</word>
78
+ <word>miten</word>
79
+ <word>mutta</word>
80
+ <word>myös</word>
81
+ <word>ne</word>
82
+ <word>no</word>
83
+ <word>nyt</word>
84
+ <word>olen</word>
85
+ <word>paitsi</word>
86
+ <word>sekä</word>
87
+ <word>siis</word>
88
+ <word>sillä</word>
89
+ <word>sinä</word>
90
+ <word>tahi</word>
91
+ <word>tahikka</word>
92
+ <word>tai</word>
93
+ <word>taikka</word>
94
+ <word>te</word>
95
+ <word>tällä</word>
96
+ <word>tämä</word>
97
+ <word>tässä</word>
98
+ <word>vaan</word>
99
+ <word>vai</word>
100
+ <word>vain</word>
101
+ <word>vasta</word>
102
+ <word>vielä</word>
103
+ <word>yli</word>
104
+ </grader-tc>
105
+ </dictionary>
@@ -0,0 +1,199 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="french">
3
+ <stemmer>
4
+
5
+
6
+ <step1_pre>
7
+ <rule>"|</rule>
8
+ <rule>(|</rule>
9
+ </step1_pre>
10
+
11
+
12
+ <step1_post>
13
+ <rule>."|</rule>
14
+ <rule>,"|</rule>
15
+ <rule>.|</rule>
16
+ <rule>,|</rule>
17
+ <rule>"|</rule>
18
+ <rule>)|</rule>
19
+ <rule>?|</rule>
20
+ <rule>:|</rule>
21
+ <rule>;|</rule>
22
+ <rule>!|</rule>
23
+ </step1_post>
24
+
25
+
26
+ <manual>
27
+ <rule>wrote|write</rule>
28
+ <rule>came|come</rule>
29
+ <rule>went|go</rule>
30
+ </manual>
31
+
32
+ <post>
33
+ <rule>before1|1after</rule>
34
+ </post>
35
+ <pre>
36
+ <rule>before1|1after</rule>
37
+ </pre>
38
+
39
+
40
+ <manual>
41
+ <rule>wrote|write</rule>
42
+ </manual>
43
+
44
+ <synonyms>
45
+ <rule>assist|help</rule>
46
+ </synonyms>
47
+
48
+ </stemmer>
49
+ <parser>
50
+ <linebreak>
51
+ <rule>."</rule>
52
+ <rule>?"</rule>
53
+ <rule>!"</rule>
54
+ <rule>,"</rule>
55
+ <rule>.</rule>
56
+ <rule>?</rule>
57
+ <rule>;</rule>
58
+ <rule>|</rule>
59
+ <rule>!</rule>
60
+ </linebreak>
61
+
62
+ <linedontbreak>
63
+ <rule>Dr.</rule>
64
+ <rule>Mr.</rule>
65
+ <rule>Mrs.</rule>
66
+ <rule>U.S.</rule>
67
+ <rule>Rep.</rule>
68
+ <rule>Sen.</rule>
69
+ </linedontbreak>
70
+ </parser>
71
+ <grader-tc>
72
+ <word>le</word>
73
+ <word>la</word>
74
+ <word>les</word>
75
+ <word>un</word>
76
+ <word>une</word>
77
+ <word>je</word>
78
+ <word>me</word>
79
+ <word>moi</word>
80
+ <word>mon</word>
81
+ <word>ma</word>
82
+ <word>mes</word>
83
+ <word>nous</word>
84
+ <word>notre</word>
85
+ <word>nos</word>
86
+ <word>tu</word>
87
+ <word>te</word>
88
+ <word>ton</word>
89
+ <word>ta</word>
90
+ <word>tes</word>
91
+ <word>vous</word>
92
+ <word>votre</word>
93
+ <word>vos</word>
94
+ <word>il</word>
95
+ <word>lui</word>
96
+ <word>son</word>
97
+ <word>sa</word>
98
+ <word>ses</word>
99
+ <word>ils</word>
100
+ <word>leur</word>
101
+ <word>leurs</word>
102
+ <word>elle</word>
103
+ <word>elles</word>
104
+ <word>on</word>
105
+ <word>où</word>
106
+ <word>quand</word>
107
+ <word></word>
108
+ <word>à</word>
109
+ <word>aussi</word>
110
+ <word>autre</word>
111
+ <word>avec</word>
112
+ <word>ça</word>
113
+ <word>ce</word>
114
+ <word>cet</word>
115
+ <word>cette</word>
116
+ <word>ces</word>
117
+ <word>ceci</word>
118
+ <word>cela</word>
119
+ <word>chaque</word>
120
+ <word>ci</word>
121
+ <word>dans</word>
122
+ <word>de</word>
123
+ <word>en</word>
124
+ <word>et</word>
125
+ <word>entre</word>
126
+ <word>ici</word>
127
+ <word>jamais</word>
128
+ <word>là</word>
129
+ <word>mais</word>
130
+ <word>même</word>
131
+ <word>moins</word>
132
+ <word>ne</word>
133
+ <word>ou</word>
134
+ <word>par</word>
135
+ <word>parfois</word>
136
+ <word>pas</word>
137
+ <word>pendant</word>
138
+ <word>plus</word>
139
+ <word>pour</word>
140
+ <word>pourtant</word>
141
+ <word>que</word>
142
+ <word>quelque</word>
143
+ <word>qui</word>
144
+ <word>quoi</word>
145
+ <word>rien</word>
146
+ <word>sans</word>
147
+ <word>si</word>
148
+ <word>sur</word>
149
+ <word>tellement</word>
150
+ <word>très</word>
151
+ <word>trop</word>
152
+ <word>y</word>
153
+ <word>des</word>
154
+ <word>du</word>
155
+ <word>n'est</word>
156
+ <word>être</word>
157
+ <word>suis</word>
158
+ <word>es</word>
159
+ <word>est</word>
160
+ <word>sommes</word>
161
+ <word>êtes</word>
162
+ <word>sont</word>
163
+ <word>étais</word>
164
+ <word>était</word>
165
+ <word>étions</word>
166
+ <word>étiez</word>
167
+ <word>étaient</word>
168
+ <word>été</word>
169
+ <word>avoir</word>
170
+ <word>ai</word>
171
+ <word>as</word>
172
+ <word>a</word>
173
+ <word>avons</word>
174
+ <word>avez</word>
175
+ <word>ont</word>
176
+ <word>avais</word>
177
+ <word>avait</word>
178
+ <word>avions</word>
179
+ <word>aviez</word>
180
+ <word>avaient</word>
181
+ <word>aie</word>
182
+ <word>aies</word>
183
+ <word>ait</word>
184
+ <word>ayons</word>
185
+ <word>ayez</word>
186
+ <word>aient</word>
187
+ <word>eu</word>
188
+ <word>devoir</word>
189
+ <word>dû</word>
190
+ <word>faire</word>
191
+ <word>fait</word>
192
+ <word>pouvoir</word>
193
+ <word>pu</word>
194
+ <word>vouloir</word>
195
+ <word>voulu</word>
196
+ <word>aller</word>
197
+ <word></word>
198
+ </grader-tc>
199
+ </dictionary>