summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,220 @@
1
+ /*
2
+ * wordlist.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ /* Word-list manipulation helpers, mainly used by grader-tc */
29
+
30
+ OtsWordEntery *
31
+ ots_new_wordEntery_strip(unsigned const char *wordString,const OtsStemRule *rule) /*for real text use*/
32
+ {
33
+ OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1);
34
+ aWord->occ = 1;
35
+ aWord->word = ots_stem_format(wordString,rule);
36
+ aWord->stem = ots_stem_strip(wordString,rule);
37
+ return aWord;
38
+ }
39
+
40
+ OtsWordEntery *
41
+ ots_new_wordEntery (unsigned const char *wordString) /*for dictionary use only, no formating here*/
42
+ {
43
+ OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1);
44
+ aWord->occ = 1;
45
+ aWord->word = g_strdup (wordString);
46
+ aWord->stem = g_strdup (wordString);
47
+ return aWord;
48
+ }
49
+
50
+
51
+ void
52
+ ots_free_wordEntery (OtsWordEntery * WC)
53
+ {
54
+ if (WC != NULL)
55
+ {
56
+ if (NULL!=WC->word) g_free (WC->word);
57
+ if (NULL!=WC->stem) g_free (WC->stem);
58
+ g_free (WC);
59
+ }
60
+ }
61
+
62
+ void
63
+ ots_free_wordlist (GList * aList)
64
+ {
65
+ if (aList != NULL)
66
+ {
67
+ g_list_foreach(aList,(GFunc)ots_free_wordEntery , NULL);
68
+ g_list_free(aList);
69
+ }
70
+ }
71
+
72
+ OtsWordEntery *
73
+ ots_copy_wordEntery (OtsWordEntery * obj)
74
+ {
75
+ OtsWordEntery *aWord;
76
+ if (obj == NULL) { return NULL;}
77
+ aWord = g_new (OtsWordEntery, 1);
78
+ aWord->occ = obj->occ;
79
+ aWord->word = g_strdup (obj->word);
80
+ if (NULL!=obj->stem)
81
+ {aWord->stem = g_strdup (obj->stem);} else {aWord->stem=NULL;}
82
+ return aWord;
83
+ }
84
+
85
+ static int
86
+ ots_sort_handler (OtsWordEntery * node1, OtsWordEntery * node2)
87
+ {
88
+ if (node1->occ > node2->occ)
89
+ return -1;
90
+ if (node1->occ < node2->occ)
91
+ return 1;
92
+ return 0;
93
+ }
94
+
95
+ GList *
96
+ ots_sort_list (GList* aList)
97
+ {
98
+ GList *newList;
99
+ newList = g_list_sort (aList, (GCompareFunc) ots_sort_handler); /* sort article */
100
+ return newList;
101
+ }
102
+
103
+ GList *
104
+ ots_union_list (const GList *aLst, const GList * bLst)
105
+ {
106
+ GList *li;
107
+ GList *di;
108
+ int insert;
109
+ GList *newLst=NULL;
110
+
111
+ for (li = (GList *) aLst; li != NULL; li = li->next)
112
+ {
113
+ insert = 1;
114
+ for (di = (GList *) bLst; di != NULL; di = di->next)
115
+ {
116
+ if(( li->data) && (di->data) && (((OtsWordEntery *) li->data)->word) && (((OtsWordEntery *) di->data)->word)) /*all defined?*/
117
+ if (0 == g_strncasecmp ((((OtsWordEntery *) li->data)->word), /*fix me: unicode issue?*/
118
+ (((OtsWordEntery *) di->data)->word), 10))
119
+ insert = 0; /* if word in B */
120
+
121
+ }
122
+ if (insert == 1)
123
+ if ((li->data))
124
+ newLst = g_list_append (newLst,ots_copy_wordEntery ((OtsWordEntery *) li->data));
125
+ }
126
+
127
+ return newLst;
128
+ }
129
+
130
+
131
+ char *
132
+ ots_word_in_list (const GList *aList,const int index) /* return the String value of the n'th word */
133
+ {
134
+ OtsWordEntery *obj = NULL;
135
+
136
+ GList *item =(GList *)g_list_nth ((GList *)aList, index);
137
+ if (item != NULL) obj = item->data;
138
+ if (obj == NULL)
139
+ {
140
+ return NULL;
141
+ }
142
+ else
143
+ return obj->word;
144
+ }
145
+
146
+ char *
147
+ ots_stem_in_list (const GList *aList,const int index) /* return the String value of stem of the n'th word */
148
+ {
149
+ OtsWordEntery *obj = NULL;
150
+
151
+ GList *item =(GList *)g_list_nth ((GList *)aList, index);
152
+ if (item != NULL) obj = item->data;
153
+ if (obj == NULL)
154
+ {
155
+ return NULL;
156
+ }
157
+ else
158
+ return obj->stem;
159
+ }
160
+
161
+ /*Adds a word to the word count of the article*/
162
+ void
163
+ ots_add_wordstat (OtsArticle * Doc,
164
+ unsigned const char *wordString)
165
+ {
166
+ GList *li;
167
+ OtsWordEntery *stat;
168
+ OtsStemRule * rule=Doc->stem;
169
+ char *tmp = NULL;
170
+
171
+ if (NULL==wordString) return;
172
+ if (NULL==Doc) return;
173
+
174
+ if (0==strlen(wordString)) return;
175
+ if (0==strcmp(wordString," ")) return;
176
+ if (0==strcmp(wordString,"\n")) return;
177
+ if (0==strcmp(wordString,"\t")) return;
178
+
179
+ if (wordString)
180
+ tmp = ots_stem_strip (wordString, rule);
181
+
182
+ for (li = (GList *) Doc->wordStat; li != NULL; li = li->next) /* search the word in current wordlist */
183
+ {
184
+ if (li->data)
185
+ if (0 == strcmp (tmp, ((OtsWordEntery *) li->data)->stem))
186
+ {
187
+ ((OtsWordEntery *) li->data)->occ++; /* occurred in another place in the text now; */
188
+ g_free (tmp);
189
+
190
+ /*printf for debug*/
191
+ /*
192
+ if (0!=strcmp(((OtsWordEntery *) li->data)->word,wordString) )
193
+ printf("[%s]==[%s]\n",((OtsWordEntery *) li->data)->word,wordString);
194
+ */
195
+
196
+ return;
197
+ }
198
+ }
199
+
200
+ stat = ots_new_wordEntery_strip (wordString, rule); /* if not in list , Add stem it to the list */
201
+ if ((stat))
202
+ Doc->wordStat = g_list_prepend (Doc->wordStat, stat);
203
+ g_free (tmp);
204
+ return;
205
+ }
206
+
207
+
208
+
209
+
210
+ void
211
+ ots_print_wordlist (FILE * stream, const GList * aList)
212
+ {
213
+ GList *li;
214
+ for (li = (GList *) aList; li != NULL; li = li->next)
215
+ fprintf (stream, "Word[%d][%s]\n", ((OtsWordEntery *) li->data)->occ,
216
+ ((OtsWordEntery *) li->data)->word);
217
+ }
218
+
219
+
220
+
data/lib/summarize.rb ADDED
@@ -0,0 +1,91 @@
1
+ require 'summarize/summarize'
2
+
3
# Backfill for Hash#symbolize_keys; the class body is only evaluated when
# hashes do not already respond to that method.
class Hash
  # Returns a new Hash with the same values whose keys have been converted
  # with #to_sym. A key whose #to_sym raises, or returns nil/false, is kept
  # unchanged.
  def symbolize_keys
    converted = {}
    each do |key, value|
      symbol = (key.to_sym rescue key)
      converted[symbol || key] = value
    end
    converted
  end
end unless {}.respond_to? 'symbolize_keys'
11
+
12
module Summarize
  VERSION = "1.0.0"

  # ISO 639-1 codes accepted for the :language option.
  LANGUAGES = [
    'bg', # Bulgarian
    'ca', # Catalan
    'cs', # Czech
    'cy', # Welsh
    'da', # Danish
    'de', # German
    'el', # Greek
    'en', # English
    'eo', # Esperanto
    'es', # Spanish
    'et', # Estonian
    'eu', # Basque
    'fi', # Finnish
    'fr', # French
    'ga', # Irish
    'gl', # Galician
    'he', # Hebrew
    'hu', # Hungarian
    'ia', # Interlingua
    'id', # Indonesian
    'is', # Icelandic
    'it', # Italian
    'lv', # Latvian
    'mi', # Maori
    'ms', # Malay
    'mt', # Maltese
    'nl', # Dutch
    'nn', # Norwegian (Nynorsk)
    'pl', # Polish
    'pt', # Portuguese
    'ro', # Romanian
    'ru', # Russian
    'sv', # Swedish
    'tl', # Tagalog
    'tr', # Turkish
    'uk', # Ukrainian
    'yi'  # Yiddish
  ]

  # Resolves user options into the pair [dictionary path, ratio].
  #
  # Recognised keys (strings or symbols):
  #   :ratio      - percentage, defaults to 25
  #   :language   - ISO 639-1 code, defaults to 'en'; must be in LANGUAGES
  #   :dictionary - explicit dictionary path; when present it is returned
  #                 as-is and :language is not checked
  #
  # Raises RuntimeError ("Language not supported") for an unknown language.
  # The language-derived path points into ext/summarize/dic relative to this
  # file; no file extension is appended here.
  def self.parse_options(options = {})
    defaults = {
      :ratio => 25,
      :language => 'en'
    }
    options = defaults.merge(options.symbolize_keys)

    dict_file =
      if options.key?(:dictionary)
        options[:dictionary]
      else
        raise "Language not supported" unless LANGUAGES.include?(options[:language])
        File.join(File.expand_path(File.dirname(__FILE__)), "../ext/summarize/dic/#{options[:language]}")
      end

    [dict_file, options[:ratio]]
  end

end
74
+
75
class String
  extend Summarize

  # Summarizes this string.
  #
  # options are resolved by Summarize.parse_options (:ratio, :language,
  # :dictionary); the work is then delegated to the class-level
  # String.summarize, called with this string, the dictionary path and the
  # ratio.
  # NOTE(review): that class-level method is not defined in this file;
  # presumably it comes from the required 'summarize/summarize' extension
  # via `extend Summarize` -- confirm.
  def summarize(options = {})
    dictionary, percentage = Summarize.parse_options(options)
    String.send(:summarize, self, dictionary, percentage)
  end

end
84
+
85
class File

  # Reads the file's contents via #read and summarizes them with
  # String#summarize, passing options through unchanged.
  def summarize(options = {})
    contents = read
    contents.summarize(options)
  end

end
Binary file
@@ -0,0 +1,15 @@
1
+ Jupiter is the fifth planet from the Sun and the largest planet within the Solar System. It is a gas giant with a mass slightly less than one-thousandth of the Sun but is two and a half times the mass of all the other planets in our Solar System combined. Jupiter is classified as a gas giant along with Saturn, Uranus and Neptune. Together, these four planets are sometimes referred to as the Jovian planets.
2
+
3
+ The planet was known by astronomers of ancient times and was associated with the mythology and religious beliefs of many cultures. The Romans named the planet after the Roman god Jupiter. When viewed from Earth, Jupiter can reach an apparent magnitude of -2.94, making it on average the third-brightest object in the night sky after the Moon and Venus. (Mars can briefly match Jupiter's brightness at certain points in its orbit.)
4
+
5
+ Jupiter is primarily composed of hydrogen with a quarter of its mass being helium; it may also have a rocky core of heavier elements. Because of its rapid rotation, Jupiter's shape is that of an oblate spheroid (it possesses a slight but noticeable bulge around the equator). The outer atmosphere is visibly segregated into several bands at different latitudes, resulting in turbulence and storms along their interacting boundaries. A prominent result is the Great Red Spot, a giant storm that is known to have existed since at least the 17th century when it was first seen by telescope. Surrounding the planet is a faint planetary ring system and a powerful magnetosphere. There are also at least 63 moons, including the four large moons called the Galilean moons that were first discovered by Galileo Galilei in 1610. Ganymede, the largest of these moons, has a diameter greater than that of the planet Mercury.
6
+
7
+ Jupiter has been explored on several occasions by robotic spacecraft, most notably during the early Pioneer and Voyager flyby missions and later by the Galileo orbiter. The most recent probe to visit Jupiter was the Pluto-bound New Horizons spacecraft in late February 2007. The probe used the gravity from Jupiter to increase its speed. Future targets for exploration in the Jovian system include the possible ice-covered liquid ocean on the moon Europa.
8
+ ||
9
+ Jupiter is the fifth planet from the Sun and the largest planet within the Solar System. It is a gas giant with a mass slightly less than one-thousandth of the Sun but is two and a half times the mass of all the other planets in our Solar System combined. The Romans named the planet after the Roman god Jupiter. (Mars can briefly match Jupiter's brightness at certain points in its orbit.)
10
+
11
+ Jupiter is primarily composed of hydrogen with a quarter of its mass being helium; There are also at least 63 moons, including the four large moons called the Galilean moons that were first discovered by Galileo Galilei in 1610.
12
+ ||
13
+ Jupiter is the fifth planet from the Sun and the largest planet within the Solar System. It is a gas giant with a mass slightly less than one-thousandth of the Sun but is two and a half times the mass of all the other planets in our Solar System combined. Together, these four planets are sometimes referred to as the Jovian planets. The Romans named the planet after the Roman god Jupiter. When viewed from Earth, Jupiter can reach an apparent magnitude of -2.94, making it on average the third-brightest object in the night sky after the Moon and Venus. (Mars can briefly match Jupiter's brightness at certain points in its orbit.)
14
+
15
+ Jupiter is primarily composed of hydrogen with a quarter of its mass being helium; Because of its rapid rotation, Jupiter's shape is that of an oblate spheroid (it possesses a slight but noticeable bulge around the equator). There are also at least 63 moons, including the four large moons called the Galilean moons that were first discovered by Galileo Galilei in 1610. The most recent probe to visit Jupiter was the Pluto-bound New Horizons spacecraft in late February 2007.
data/summarize.gemspec ADDED
@@ -0,0 +1,21 @@
1
# Put this gem's lib/ directory on the load path so that Summarize::VERSION
# can be read from lib/summarize.rb below.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
require "summarize"

Gem::Specification.new do |s|
  # Identity
  s.name        = "summarize"
  s.summary     = "Open Text Summarizer"
  s.description = "Ruby C Extension for Open Text Summarizer"
  s.homepage    = "http://github.com/intridea"
  s.version     = Summarize::VERSION
  s.authors     = ["Sean Soper"]
  s.email       = "sean.soper@gmail.com"

  s.rubygems_version = "1.3.7"
  s.date             = "2010-11-08"

  # Layout: Ruby code lives in lib/, the native extension is configured by
  # ext/summarize/extconf.rb.
  s.require_paths = ["lib"]
  s.extensions    = ["ext/summarize/extconf.rb"]

  # Package every file tracked by git except paths starting with
  # "features" or "cucumber".
  tracked_files = `git ls-files`.split("\n")
  s.files = tracked_files.reject { |file| file =~ /^(features|cucumber)/ }

  s.add_dependency 'rake-compiler'
end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: summarize
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Sean Soper
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-08 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rake-compiler
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Ruby C Extension for Open Text Summarizer
36
+ email: sean.soper@gmail.com
37
+ executables: []
38
+
39
+ extensions:
40
+ - ext/summarize/extconf.rb
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - .gitignore
45
+ - README.markdown
46
+ - Rakefile
47
+ - ext/summarize/article.c
48
+ - ext/summarize/dic/bg.xml
49
+ - ext/summarize/dic/ca.xml
50
+ - ext/summarize/dic/cs.xml
51
+ - ext/summarize/dic/cy.xml
52
+ - ext/summarize/dic/da.xml
53
+ - ext/summarize/dic/de.xml
54
+ - ext/summarize/dic/el.xml
55
+ - ext/summarize/dic/en.xml
56
+ - ext/summarize/dic/eo.xml
57
+ - ext/summarize/dic/es.xml
58
+ - ext/summarize/dic/et.xml
59
+ - ext/summarize/dic/eu.xml
60
+ - ext/summarize/dic/fi.xml
61
+ - ext/summarize/dic/fr.xml
62
+ - ext/summarize/dic/ga.xml
63
+ - ext/summarize/dic/gl.xml
64
+ - ext/summarize/dic/he.xml
65
+ - ext/summarize/dic/hu.xml
66
+ - ext/summarize/dic/ia.xml
67
+ - ext/summarize/dic/id.xml
68
+ - ext/summarize/dic/is.xml
69
+ - ext/summarize/dic/it.xml
70
+ - ext/summarize/dic/lv.xml
71
+ - ext/summarize/dic/mi.xml
72
+ - ext/summarize/dic/ms.xml
73
+ - ext/summarize/dic/mt.xml
74
+ - ext/summarize/dic/nl.xml
75
+ - ext/summarize/dic/nn.xml
76
+ - ext/summarize/dic/pl.xml
77
+ - ext/summarize/dic/pt.xml
78
+ - ext/summarize/dic/ro.xml
79
+ - ext/summarize/dic/ru.xml
80
+ - ext/summarize/dic/sv.xml
81
+ - ext/summarize/dic/tl.xml
82
+ - ext/summarize/dic/tr.xml
83
+ - ext/summarize/dic/uk.xml
84
+ - ext/summarize/dic/yi.xml
85
+ - ext/summarize/dictionary.c
86
+ - ext/summarize/extconf.rb
87
+ - ext/summarize/grader-tc.c
88
+ - ext/summarize/grader-tc.h
89
+ - ext/summarize/grader-tf.c
90
+ - ext/summarize/grader.c
91
+ - ext/summarize/highlighter.c
92
+ - ext/summarize/html.c
93
+ - ext/summarize/libots.h
94
+ - ext/summarize/parser.c
95
+ - ext/summarize/relations.c
96
+ - ext/summarize/stemmer.c
97
+ - ext/summarize/summarize.c
98
+ - ext/summarize/summarize.h
99
+ - ext/summarize/text.c
100
+ - ext/summarize/wordlist.c
101
+ - lib/summarize.rb
102
+ - lib/summarize/summarize.bundle
103
+ - sample_data/jupiter.txt
104
+ - summarize.gemspec
105
+ has_rdoc: true
106
+ homepage: http://github.com/intridea
107
+ licenses: []
108
+
109
+ post_install_message:
110
+ rdoc_options: []
111
+
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ hash: 3
120
+ segments:
121
+ - 0
122
+ version: "0"
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ none: false
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ hash: 3
129
+ segments:
130
+ - 0
131
+ version: "0"
132
+ requirements: []
133
+
134
+ rubyforge_project:
135
+ rubygems_version: 1.3.7
136
+ signing_key:
137
+ specification_version: 3
138
+ summary: Open Text Summarizer
139
+ test_files: []
140
+