ots 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,80 @@
1
+ # OTS
2
+
3
+ ots is a Ruby interface to libots, the Open Text Summarizer.
4
+
5
+ ## Dependencies
6
+
7
+ * ruby 1.9.1 or later
8
+ * libxml2
9
+ * glib2.0
10
+ * Homebrew (on MacOSX)
11
+
12
+ ## Installation
13
+
14
+ ### Debian flavors of Linux
15
+
16
+ ```
17
+
18
+ # ruby & ruby development libraries (not needed if you use rvm)
19
+ sudo apt-get install ruby1.9.1-dev ruby1.9.1
20
+
21
+ # libxml2 and glib development libraries
22
+ sudo apt-get install libxml2-dev libglib2.0-dev
23
+
24
+ # install ots
25
+ gem install ots
26
+
27
+ ```
28
+
29
+ ### MacOSX
30
+
31
+
32
+ ```
33
+
34
+ # update homebrew to latest & greatest version
35
+ GIT_SSL_NO_VERIFY=1 brew update
36
+
37
+ # install glib
38
+ brew install glib
39
+
40
+ # install ots
41
+ gem install ots
42
+
43
+ ```
44
+
45
+ ## API
46
+
47
+ ```
48
+ OTS
49
+ .parse #=> OTS::Article
50
+ .dictionaries #=> Array
51
+
52
+ OTS::Article
53
+ .new
54
+ #summarize #=> Array
55
+ #keywords #=> Array
56
+ #title #=> String
57
+
58
+ ```
59
+
60
+ ## Usage
61
+
62
+ ```ruby
63
+ require 'ots'
64
+ article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
65
+
66
+ article.keywords
67
+ article.summarize(lines: 1)
68
+ article.summarize(percent: 50)
69
+
70
+ article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
71
+ article.keywords
72
+ article.summarize(lines: 1)
73
+ article.summarize(percent: 50)
74
+
75
+ OTS.dictionaries #=> list of supported dictionaries
76
+ ```
77
+
78
+ ## License
79
+
80
+ [Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
@@ -0,0 +1,101 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="bulgarian">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>август</word>
62
+ <word>април</word>
63
+ <word>в</word>
64
+ <word>всеки</word>
65
+ <word>всичко</word>
66
+ <word>вторник</word>
67
+ <word>да</word>
68
+ <word>декември</word>
69
+ <word>за</word>
70
+ <word>и</word>
71
+ <word>или</word>
72
+ <word>има</word>
73
+ <word>което</word>
74
+ <word>към</word>
75
+ <word>май</word>
76
+ <word>март</word>
77
+ <word>на</word>
78
+ <word>не</word>
79
+ <word>неделя</word>
80
+ <word>ноември</word>
81
+ <word>октомври</word>
82
+ <word>от</word>
83
+ <word>петък</word>
84
+ <word>по</word>
85
+ <word>понеделник</word>
86
+ <word>при</word>
87
+ <word>с</word>
88
+ <word>септември</word>
89
+ <word>сряда</word>
90
+ <word>сто</word>
91
+ <word>събота</word>
92
+ <word>трябва</word>
93
+ <word>февруари</word>
94
+ <word>хиляда</word>
95
+ <word>че</word>
96
+ <word>четвъртък</word>
97
+ <word>юли</word>
98
+ <word>юни</word>
99
+ <word>януари</word>
100
+ </grader-tc>
101
+ </dictionary>
@@ -0,0 +1,141 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="catalan">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>a</word>
62
+ <word>abans</word>
63
+ <word>al</word>
64
+ <word>amb</word>
65
+ <word>ambdós</word>
66
+ <word>anar</word>
67
+ <word>ara</word>
68
+ <word>baix</word>
69
+ <word>cap</word>
70
+ <word>cert</word>
71
+ <word>com</word>
72
+ <word>cuál</word>
73
+ <word>damunt</word>
74
+ <word>de</word>
75
+ <word>dins</word>
76
+ <word>doble</word>
77
+ <word>dos</word>
78
+ <word>dues</word>
79
+ <word>el</word>
80
+ <word>ell</word>
81
+ <word>ella</word>
82
+ <word>elles</word>
83
+ <word>ells</word>
84
+ <word>els</word>
85
+ <word>en</word>
86
+ <word>ésser</word>
87
+ <word>estar</word>
88
+ <word>excepte</word>
89
+ <word>jo</word>
90
+ <word>la</word>
91
+ <word>les</word>
92
+ <word>lluny</word>
93
+ <word>lo</word>
94
+ <word>los</word>
95
+ <word>mai</word>
96
+ <word>me</word>
97
+ <word>meu</word>
98
+ <word>meus</word>
99
+ <word>meva</word>
100
+ <word>meves</word>
101
+ <word>mí</word>
102
+ <word>na</word>
103
+ <word>nos</word>
104
+ <word>nosaltres</word>
105
+ <word>nostra</word>
106
+ <word>nostre</word>
107
+ <word>nostres</word>
108
+ <word>qual</word>
109
+ <word>quals</word>
110
+ <word>quan</word>
111
+ <word>quelcom</word>
112
+ <word>quin</word>
113
+ <word>quina</word>
114
+ <word>quines</word>
115
+ <word>quins</word>
116
+ <word>se</word>
117
+ <word>ser</word>
118
+ <word>seu</word>
119
+ <word>seus</word>
120
+ <word>seva</word>
121
+ <word>seves</word>
122
+ <word>sí</word>
123
+ <word>tenir</word>
124
+ <word>teu</word>
125
+ <word>teus</word>
126
+ <word>teva</word>
127
+ <word>teves</word>
128
+ <word>tu</word>
129
+ <word>u</word>
130
+ <word>un</word>
131
+ <word>una</word>
132
+ <word>unes</word>
133
+ <word>uns</word>
134
+ <word>vosaltres</word>
135
+ <word>vostè</word>
136
+ <word>vostès</word>
137
+ <word>vostra</word>
138
+ <word>vostre</word>
139
+ <word>vostres</word>
140
+ </grader-tc>
141
+ </dictionary>
@@ -0,0 +1,161 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="czech">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>a</word>
62
+ <word>aby</word>
63
+ <word>ale</word>
64
+ <word>ani</word>
65
+ <word>ano</word>
66
+ <word>až</word>
67
+ <word>být</word>
68
+ <word>co</word>
69
+ <word>dělat</word>
70
+ <word>dnes</word>
71
+ <word>do</word>
72
+ <word>doma</word>
73
+ <word>domů</word>
74
+ <word>i</word>
75
+ <word>já</word>
76
+ <word>jak</word>
77
+ <word>jako</word>
78
+ <word>je</word>
79
+ <word>jen</word>
80
+ <word>jenom</word>
81
+ <word>ještě</word>
82
+ <word>ještěže</word>
83
+ <word>ji</word>
84
+ <word>jinak</word>
85
+ <word>jít</word>
86
+ <word>jsem</word>
87
+ <word>jsi</word>
88
+ <word>jsme</word>
89
+ <word>jsou</word>
90
+ <word>jste</word>
91
+ <word>k</word>
92
+ <word>každý</word>
93
+ <word>kde</word>
94
+ <word>kdo</word>
95
+ <word>když</word>
96
+ <word>konečně</word>
97
+ <word>který</word>
98
+ <word>mají</word>
99
+ <word>mě</word>
100
+ <word>mimochodem</word>
101
+ <word>mít</word>
102
+ <word>moc</word>
103
+ <word>moci</word>
104
+ <word>moct</word>
105
+ <word>mohou</word>
106
+ <word>mohu</word>
107
+ <word>moje</word>
108
+ <word>moji</word>
109
+ <word>můj</word>
110
+ <word>může</word>
111
+ <word>my</word>
112
+ <word>na</word>
113
+ <word>naproti</word>
114
+ <word>náš</word>
115
+ <word>naše</word>
116
+ <word>ne</word>
117
+ <word>nebo</word>
118
+ <word>něco</word>
119
+ <word>někdy</word>
120
+ <word>není</word>
121
+ <word>nic</word>
122
+ <word>o</word>
123
+ <word>od</word>
124
+ <word>on</word>
125
+ <word>ona</word>
126
+ <word>oni</word>
127
+ <word>ono</word>
128
+ <word>ony</word>
129
+ <word>ovšem</word>
130
+ <word>po</word>
131
+ <word>protože</word>
132
+ <word>samozřejmě</word>
133
+ <word>se</word>
134
+ <word>slečna</word>
135
+ <word>tady</word>
136
+ <word>tak</word>
137
+ <word>také</word>
138
+ <word>taky</word>
139
+ <word>tam</word>
140
+ <word>ten</word>
141
+ <word>to</word>
142
+ <word>totiž</word>
143
+ <word>tu</word>
144
+ <word>ty</word>
145
+ <word>u</word>
146
+ <word>v</word>
147
+ <word>váš</word>
148
+ <word>vaše</word>
149
+ <word>ve</word>
150
+ <word>velmi</word>
151
+ <word>vlastní</word>
152
+ <word>vy</word>
153
+ <word>z</word>
154
+ <word>za</word>
155
+ <word>zase</word>
156
+ <word>zde</word>
157
+ <word>zítra</word>
158
+ <word>znova</word>
159
+ <word>že</word>
160
+ </grader-tc>
161
+ </dictionary>