ots 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/README.md
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# OTS
|
2
|
+
|
3
|
+
ots is a Ruby interface to libots, the Open Text Summarizer.
|
4
|
+
|
5
|
+
## Dependencies
|
6
|
+
|
7
|
+
* ruby 1.9.1 or later
|
8
|
+
* libxml2
|
9
|
+
* glib2.0
|
10
|
+
* homebrew (on MacOSX)
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
### Debian flavors of Linux
|
15
|
+
|
16
|
+
```
|
17
|
+
|
18
|
+
# ruby & ruby development libraries (not needed if you use rvm)
|
19
|
+
sudo apt-get install ruby1.9.1-dev ruby1.9.1
|
20
|
+
|
21
|
+
# libxml2 and glib development libraries
|
22
|
+
sudo apt-get install libxml2-dev libglib2.0-dev
|
23
|
+
|
24
|
+
# install ots
|
25
|
+
gem install ots
|
26
|
+
|
27
|
+
```
|
28
|
+
|
29
|
+
### MacOSX
|
30
|
+
|
31
|
+
|
32
|
+
```
|
33
|
+
|
34
|
+
# update homebrew to latest & greatest version
|
35
|
+
GIT_SSL_NO_VERIFY=1 brew update
|
36
|
+
|
37
|
+
# install glib
|
38
|
+
brew install glib
|
39
|
+
|
40
|
+
# install ots
|
41
|
+
gem install ots
|
42
|
+
|
43
|
+
```
|
44
|
+
|
45
|
+
## API
|
46
|
+
|
47
|
+
```
|
48
|
+
OTS
|
49
|
+
.parse #=> OTS::Article
|
50
|
+
.dictionaries #=> Array
|
51
|
+
|
52
|
+
OTS::Article
|
53
|
+
.new
|
54
|
+
#summarize #=> Array
|
55
|
+
#keywords #=> Array
|
56
|
+
#title #=> String
|
57
|
+
|
58
|
+
```
|
59
|
+
|
60
|
+
## Usage
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
require 'ots'
|
64
|
+
article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
|
65
|
+
|
66
|
+
article.keywords
|
67
|
+
article.summarize(lines: 1)
|
68
|
+
article.summarize(percent: 50)
|
69
|
+
|
70
|
+
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
|
71
|
+
article.keywords
|
72
|
+
article.summarize(lines: 1)
|
73
|
+
article.summarize(percent: 50)
|
74
|
+
|
75
|
+
OTS.dictionaries #=> list of supported dictionaries
|
76
|
+
```
|
77
|
+
|
78
|
+
## License
|
79
|
+
|
80
|
+
[Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
|
data/dictionaries/bg.xml
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="bulgarian">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>август</word>
|
62
|
+
<word>април</word>
|
63
|
+
<word>в</word>
|
64
|
+
<word>всеки</word>
|
65
|
+
<word>всичко</word>
|
66
|
+
<word>вторник</word>
|
67
|
+
<word>да</word>
|
68
|
+
<word>декември</word>
|
69
|
+
<word>за</word>
|
70
|
+
<word>и</word>
|
71
|
+
<word>или</word>
|
72
|
+
<word>има</word>
|
73
|
+
<word>което</word>
|
74
|
+
<word>към</word>
|
75
|
+
<word>май</word>
|
76
|
+
<word>март</word>
|
77
|
+
<word>на</word>
|
78
|
+
<word>не</word>
|
79
|
+
<word>неделя</word>
|
80
|
+
<word>ноември</word>
|
81
|
+
<word>октомври</word>
|
82
|
+
<word>от</word>
|
83
|
+
<word>петък</word>
|
84
|
+
<word>по</word>
|
85
|
+
<word>понеделник</word>
|
86
|
+
<word>при</word>
|
87
|
+
<word>с</word>
|
88
|
+
<word>септември</word>
|
89
|
+
<word>сряда</word>
|
90
|
+
<word>сто</word>
|
91
|
+
<word>събота</word>
|
92
|
+
<word>трябва</word>
|
93
|
+
<word>февруари</word>
|
94
|
+
<word>хиляда</word>
|
95
|
+
<word>че</word>
|
96
|
+
<word>четвъртък</word>
|
97
|
+
<word>юли</word>
|
98
|
+
<word>юни</word>
|
99
|
+
<word>януари</word>
|
100
|
+
</grader-tc>
|
101
|
+
</dictionary>
|
data/dictionaries/ca.xml
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="catalan">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>a</word>
|
62
|
+
<word>abans</word>
|
63
|
+
<word>al</word>
|
64
|
+
<word>amb</word>
|
65
|
+
<word>ambdós</word>
|
66
|
+
<word>anar</word>
|
67
|
+
<word>ara</word>
|
68
|
+
<word>baix</word>
|
69
|
+
<word>cap</word>
|
70
|
+
<word>cert</word>
|
71
|
+
<word>com</word>
|
72
|
+
<word>cuál</word>
|
73
|
+
<word>damunt</word>
|
74
|
+
<word>de</word>
|
75
|
+
<word>dins</word>
|
76
|
+
<word>doble</word>
|
77
|
+
<word>dos</word>
|
78
|
+
<word>dues</word>
|
79
|
+
<word>el</word>
|
80
|
+
<word>ell</word>
|
81
|
+
<word>ella</word>
|
82
|
+
<word>elles</word>
|
83
|
+
<word>ells</word>
|
84
|
+
<word>els</word>
|
85
|
+
<word>en</word>
|
86
|
+
<word>ésser</word>
|
87
|
+
<word>estar</word>
|
88
|
+
<word>excepte</word>
|
89
|
+
<word>jo</word>
|
90
|
+
<word>la</word>
|
91
|
+
<word>les</word>
|
92
|
+
<word>lluny</word>
|
93
|
+
<word>lo</word>
|
94
|
+
<word>los</word>
|
95
|
+
<word>mai</word>
|
96
|
+
<word>me</word>
|
97
|
+
<word>meu</word>
|
98
|
+
<word>meus</word>
|
99
|
+
<word>meva</word>
|
100
|
+
<word>meves</word>
|
101
|
+
<word>mí</word>
|
102
|
+
<word>na</word>
|
103
|
+
<word>nos</word>
|
104
|
+
<word>nosaltres</word>
|
105
|
+
<word>nostra</word>
|
106
|
+
<word>nostre</word>
|
107
|
+
<word>nostres</word>
|
108
|
+
<word>qual</word>
|
109
|
+
<word>quals</word>
|
110
|
+
<word>quan</word>
|
111
|
+
<word>quelcom</word>
|
112
|
+
<word>quin</word>
|
113
|
+
<word>quina</word>
|
114
|
+
<word>quines</word>
|
115
|
+
<word>quins</word>
|
116
|
+
<word>se</word>
|
117
|
+
<word>ser</word>
|
118
|
+
<word>seu</word>
|
119
|
+
<word>seus</word>
|
120
|
+
<word>seva</word>
|
121
|
+
<word>seves</word>
|
122
|
+
<word>sí</word>
|
123
|
+
<word>tenir</word>
|
124
|
+
<word>teu</word>
|
125
|
+
<word>teus</word>
|
126
|
+
<word>teva</word>
|
127
|
+
<word>teves</word>
|
128
|
+
<word>tu</word>
|
129
|
+
<word>u</word>
|
130
|
+
<word>un</word>
|
131
|
+
<word>una</word>
|
132
|
+
<word>unes</word>
|
133
|
+
<word>uns</word>
|
134
|
+
<word>vosaltres</word>
|
135
|
+
<word>vostè</word>
|
136
|
+
<word>vostès</word>
|
137
|
+
<word>vostra</word>
|
138
|
+
<word>vostre</word>
|
139
|
+
<word>vostres</word>
|
140
|
+
</grader-tc>
|
141
|
+
</dictionary>
|
data/dictionaries/cs.xml
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="czech">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>a</word>
|
62
|
+
<word>aby</word>
|
63
|
+
<word>ale</word>
|
64
|
+
<word>ani</word>
|
65
|
+
<word>ano</word>
|
66
|
+
<word>až</word>
|
67
|
+
<word>být</word>
|
68
|
+
<word>co</word>
|
69
|
+
<word>dělat</word>
|
70
|
+
<word>dnes</word>
|
71
|
+
<word>do</word>
|
72
|
+
<word>doma</word>
|
73
|
+
<word>domů</word>
|
74
|
+
<word>i</word>
|
75
|
+
<word>já</word>
|
76
|
+
<word>jak</word>
|
77
|
+
<word>jako</word>
|
78
|
+
<word>je</word>
|
79
|
+
<word>jen</word>
|
80
|
+
<word>jenom</word>
|
81
|
+
<word>ještě</word>
|
82
|
+
<word>ještěže</word>
|
83
|
+
<word>ji</word>
|
84
|
+
<word>jinak</word>
|
85
|
+
<word>jít</word>
|
86
|
+
<word>jsem</word>
|
87
|
+
<word>jsi</word>
|
88
|
+
<word>jsme</word>
|
89
|
+
<word>jsou</word>
|
90
|
+
<word>jste</word>
|
91
|
+
<word>k</word>
|
92
|
+
<word>každý</word>
|
93
|
+
<word>kde</word>
|
94
|
+
<word>kdo</word>
|
95
|
+
<word>když</word>
|
96
|
+
<word>konečně</word>
|
97
|
+
<word>který</word>
|
98
|
+
<word>mají</word>
|
99
|
+
<word>mě</word>
|
100
|
+
<word>mimochodem</word>
|
101
|
+
<word>mít</word>
|
102
|
+
<word>moc</word>
|
103
|
+
<word>moci</word>
|
104
|
+
<word>moct</word>
|
105
|
+
<word>mohou</word>
|
106
|
+
<word>mohu</word>
|
107
|
+
<word>moje</word>
|
108
|
+
<word>moji</word>
|
109
|
+
<word>můj</word>
|
110
|
+
<word>může</word>
|
111
|
+
<word>my</word>
|
112
|
+
<word>na</word>
|
113
|
+
<word>naproti</word>
|
114
|
+
<word>náš</word>
|
115
|
+
<word>naše</word>
|
116
|
+
<word>ne</word>
|
117
|
+
<word>nebo</word>
|
118
|
+
<word>něco</word>
|
119
|
+
<word>někdy</word>
|
120
|
+
<word>není</word>
|
121
|
+
<word>nic</word>
|
122
|
+
<word>o</word>
|
123
|
+
<word>od</word>
|
124
|
+
<word>on</word>
|
125
|
+
<word>ona</word>
|
126
|
+
<word>oni</word>
|
127
|
+
<word>ono</word>
|
128
|
+
<word>ony</word>
|
129
|
+
<word>ovšem</word>
|
130
|
+
<word>po</word>
|
131
|
+
<word>protože</word>
|
132
|
+
<word>samozřejmě</word>
|
133
|
+
<word>se</word>
|
134
|
+
<word>slečna</word>
|
135
|
+
<word>tady</word>
|
136
|
+
<word>tak</word>
|
137
|
+
<word>také</word>
|
138
|
+
<word>taky</word>
|
139
|
+
<word>tam</word>
|
140
|
+
<word>ten</word>
|
141
|
+
<word>to</word>
|
142
|
+
<word>totiž</word>
|
143
|
+
<word>tu</word>
|
144
|
+
<word>ty</word>
|
145
|
+
<word>u</word>
|
146
|
+
<word>v</word>
|
147
|
+
<word>váš</word>
|
148
|
+
<word>vaše</word>
|
149
|
+
<word>ve</word>
|
150
|
+
<word>velmi</word>
|
151
|
+
<word>vlastní</word>
|
152
|
+
<word>vy</word>
|
153
|
+
<word>z</word>
|
154
|
+
<word>za</word>
|
155
|
+
<word>zase</word>
|
156
|
+
<word>zde</word>
|
157
|
+
<word>zítra</word>
|
158
|
+
<word>znova</word>
|
159
|
+
<word>že</word>
|
160
|
+
</grader-tc>
|
161
|
+
</dictionary>
|