ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,80 @@
1
+ # OTS
2
+
3
+ ots is a Ruby interface to libots, the Open Text Summarizer.
4
+
5
+ ## Dependencies
6
+
7
+ * ruby 1.9.1 or later
8
+ * libxml2
9
+ * glib2.0
10
+ * Homebrew (Mac OS X only, used to install glib)
11
+
12
+ ## Installation
13
+
14
+ ### Debian flavors of Linux
15
+
16
+ ```
17
+
18
+ # ruby & ruby development libraries (not needed if you use rvm)
19
+ sudo apt-get install ruby1.9.1-dev ruby1.9.1
20
+
21
+ # libxml2 and glib development libraries
22
+ sudo apt-get install libxml2-dev libglib2.0-dev
23
+
24
+ # install ots
25
+ gem install ots
26
+
27
+ ```
28
+
29
+ ### MacOSX
30
+
31
+
32
+ ```
33
+
34
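+ # update homebrew to the latest version (GIT_SSL_NO_VERIFY=1 disables git's TLS certificate checks; drop it unless the update fails with a certificate error)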
35
+ GIT_SSL_NO_VERIFY=1 brew update
36
+
37
+ # install glib
38
+ brew install glib
39
+
40
+ # install ots
41
+ gem install ots
42
+
43
+ ```
44
+
45
+ ## API
46
+
47
+ ```
48
+ OTS
49
+ .parse #=> OTS::Article
50
+ .dictionaries #=> Array
51
+
52
+ OTS::Article
53
+ .new
54
+ #summarize #=> Array
55
+ #keywords #=> Array
56
+ #title #=> String
57
+
58
+ ```
59
+
60
+ ## Usage
61
+
62
+ ```ruby
63
+ require 'ots'
64
+ article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
65
+
66
+ article.keywords
67
+ article.summarize(lines: 1)
68
+ article.summarize(percent: 50)
69
+
70
+ article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
71
+ article.keywords
72
+ article.summarize(lines: 1)
73
+ article.summarize(percent: 50)
74
+
75
+ OTS.dictionaries #=> list of supported dictionaries
76
+ ```
77
+
78
+ ## License
79
+
80
+ [Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
@@ -0,0 +1,101 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="bulgarian">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>август</word>
62
+ <word>април</word>
63
+ <word>в</word>
64
+ <word>всеки</word>
65
+ <word>всичко</word>
66
+ <word>вторник</word>
67
+ <word>да</word>
68
+ <word>декември</word>
69
+ <word>за</word>
70
+ <word>и</word>
71
+ <word>или</word>
72
+ <word>има</word>
73
+ <word>което</word>
74
+ <word>към</word>
75
+ <word>май</word>
76
+ <word>март</word>
77
+ <word>на</word>
78
+ <word>не</word>
79
+ <word>неделя</word>
80
+ <word>ноември</word>
81
+ <word>октомври</word>
82
+ <word>от</word>
83
+ <word>петък</word>
84
+ <word>по</word>
85
+ <word>понеделник</word>
86
+ <word>при</word>
87
+ <word>с</word>
88
+ <word>септември</word>
89
+ <word>сряда</word>
90
+ <word>сто</word>
91
+ <word>събота</word>
92
+ <word>трябва</word>
93
+ <word>февруари</word>
94
+ <word>хиляда</word>
95
+ <word>че</word>
96
+ <word>четвъртък</word>
97
+ <word>юли</word>
98
+ <word>юни</word>
99
+ <word>януари</word>
100
+ </grader-tc>
101
+ </dictionary>
@@ -0,0 +1,141 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="catalan">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>a</word>
62
+ <word>abans</word>
63
+ <word>al</word>
64
+ <word>amb</word>
65
+ <word>ambdós</word>
66
+ <word>anar</word>
67
+ <word>ara</word>
68
+ <word>baix</word>
69
+ <word>cap</word>
70
+ <word>cert</word>
71
+ <word>com</word>
72
+ <word>cuál</word>
73
+ <word>damunt</word>
74
+ <word>de</word>
75
+ <word>dins</word>
76
+ <word>doble</word>
77
+ <word>dos</word>
78
+ <word>dues</word>
79
+ <word>el</word>
80
+ <word>ell</word>
81
+ <word>ella</word>
82
+ <word>elles</word>
83
+ <word>ells</word>
84
+ <word>els</word>
85
+ <word>en</word>
86
+ <word>ésser</word>
87
+ <word>estar</word>
88
+ <word>excepte</word>
89
+ <word>jo</word>
90
+ <word>la</word>
91
+ <word>les</word>
92
+ <word>lluny</word>
93
+ <word>lo</word>
94
+ <word>los</word>
95
+ <word>mai</word>
96
+ <word>me</word>
97
+ <word>meu</word>
98
+ <word>meus</word>
99
+ <word>meva</word>
100
+ <word>meves</word>
101
+ <word>mí</word>
102
+ <word>na</word>
103
+ <word>nos</word>
104
+ <word>nosaltres</word>
105
+ <word>nostra</word>
106
+ <word>nostre</word>
107
+ <word>nostres</word>
108
+ <word>qual</word>
109
+ <word>quals</word>
110
+ <word>quan</word>
111
+ <word>quelcom</word>
112
+ <word>quin</word>
113
+ <word>quina</word>
114
+ <word>quines</word>
115
+ <word>quins</word>
116
+ <word>se</word>
117
+ <word>ser</word>
118
+ <word>seu</word>
119
+ <word>seus</word>
120
+ <word>seva</word>
121
+ <word>seves</word>
122
+ <word>sí</word>
123
+ <word>tenir</word>
124
+ <word>teu</word>
125
+ <word>teus</word>
126
+ <word>teva</word>
127
+ <word>teves</word>
128
+ <word>tu</word>
129
+ <word>u</word>
130
+ <word>un</word>
131
+ <word>una</word>
132
+ <word>unes</word>
133
+ <word>uns</word>
134
+ <word>vosaltres</word>
135
+ <word>vostè</word>
136
+ <word>vostès</word>
137
+ <word>vostra</word>
138
+ <word>vostre</word>
139
+ <word>vostres</word>
140
+ </grader-tc>
141
+ </dictionary>
@@ -0,0 +1,161 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="czech">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>a</word>
62
+ <word>aby</word>
63
+ <word>ale</word>
64
+ <word>ani</word>
65
+ <word>ano</word>
66
+ <word>až</word>
67
+ <word>být</word>
68
+ <word>co</word>
69
+ <word>dělat</word>
70
+ <word>dnes</word>
71
+ <word>do</word>
72
+ <word>doma</word>
73
+ <word>domů</word>
74
+ <word>i</word>
75
+ <word>já</word>
76
+ <word>jak</word>
77
+ <word>jako</word>
78
+ <word>je</word>
79
+ <word>jen</word>
80
+ <word>jenom</word>
81
+ <word>ještě</word>
82
+ <word>ještěže</word>
83
+ <word>ji</word>
84
+ <word>jinak</word>
85
+ <word>jít</word>
86
+ <word>jsem</word>
87
+ <word>jsi</word>
88
+ <word>jsme</word>
89
+ <word>jsou</word>
90
+ <word>jste</word>
91
+ <word>k</word>
92
+ <word>každý</word>
93
+ <word>kde</word>
94
+ <word>kdo</word>
95
+ <word>když</word>
96
+ <word>konečně</word>
97
+ <word>který</word>
98
+ <word>mají</word>
99
+ <word>mě</word>
100
+ <word>mimochodem</word>
101
+ <word>mít</word>
102
+ <word>moc</word>
103
+ <word>moci</word>
104
+ <word>moct</word>
105
+ <word>mohou</word>
106
+ <word>mohu</word>
107
+ <word>moje</word>
108
+ <word>moji</word>
109
+ <word>můj</word>
110
+ <word>může</word>
111
+ <word>my</word>
112
+ <word>na</word>
113
+ <word>naproti</word>
114
+ <word>náš</word>
115
+ <word>naše</word>
116
+ <word>ne</word>
117
+ <word>nebo</word>
118
+ <word>něco</word>
119
+ <word>někdy</word>
120
+ <word>není</word>
121
+ <word>nic</word>
122
+ <word>o</word>
123
+ <word>od</word>
124
+ <word>on</word>
125
+ <word>ona</word>
126
+ <word>oni</word>
127
+ <word>ono</word>
128
+ <word>ony</word>
129
+ <word>ovšem</word>
130
+ <word>po</word>
131
+ <word>protože</word>
132
+ <word>samozřejmě</word>
133
+ <word>se</word>
134
+ <word>slečna</word>
135
+ <word>tady</word>
136
+ <word>tak</word>
137
+ <word>také</word>
138
+ <word>taky</word>
139
+ <word>tam</word>
140
+ <word>ten</word>
141
+ <word>to</word>
142
+ <word>totiž</word>
143
+ <word>tu</word>
144
+ <word>ty</word>
145
+ <word>u</word>
146
+ <word>v</word>
147
+ <word>váš</word>
148
+ <word>vaše</word>
149
+ <word>ve</word>
150
+ <word>velmi</word>
151
+ <word>vlastní</word>
152
+ <word>vy</word>
153
+ <word>z</word>
154
+ <word>za</word>
155
+ <word>zase</word>
156
+ <word>zde</word>
157
+ <word>zítra</word>
158
+ <word>znova</word>
159
+ <word>že</word>
160
+ </grader-tc>
161
+ </dictionary>