RubyGems - summarize - Versions diffs - 1.0.0 - Mend

summarize 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

data/.gitignore +11 -0
data/README.markdown +42 -0
data/Rakefile +49 -0
data/ext/summarize/article.c +119 -0
data/ext/summarize/dic/bg.xml +101 -0
data/ext/summarize/dic/ca.xml +141 -0
data/ext/summarize/dic/cs.xml +161 -0
data/ext/summarize/dic/cy.xml +118 -0
data/ext/summarize/dic/da.xml +129 -0
data/ext/summarize/dic/de.xml +354 -0
data/ext/summarize/dic/el.xml +80 -0
data/ext/summarize/dic/en.xml +606 -0
data/ext/summarize/dic/eo.xml +171 -0
data/ext/summarize/dic/es.xml +369 -0
data/ext/summarize/dic/et.xml +172 -0
data/ext/summarize/dic/eu.xml +77 -0
data/ext/summarize/dic/fi.xml +105 -0
data/ext/summarize/dic/fr.xml +199 -0
data/ext/summarize/dic/ga.xml +124 -0
data/ext/summarize/dic/gl.xml +290 -0
data/ext/summarize/dic/he.xml +334 -0
data/ext/summarize/dic/hu.xml +280 -0
data/ext/summarize/dic/ia.xml +97 -0
data/ext/summarize/dic/id.xml +75 -0
data/ext/summarize/dic/is.xml +201 -0
data/ext/summarize/dic/it.xml +206 -0
data/ext/summarize/dic/lv.xml +77 -0
data/ext/summarize/dic/mi.xml +76 -0
data/ext/summarize/dic/ms.xml +160 -0
data/ext/summarize/dic/mt.xml +73 -0
data/ext/summarize/dic/nl.xml +245 -0
data/ext/summarize/dic/nn.xml +264 -0
data/ext/summarize/dic/pl.xml +92 -0
data/ext/summarize/dic/pt.xml +365 -0
data/ext/summarize/dic/ro.xml +163 -0
data/ext/summarize/dic/ru.xml +150 -0
data/ext/summarize/dic/sv.xml +255 -0
data/ext/summarize/dic/tl.xml +67 -0
data/ext/summarize/dic/tr.xml +65 -0
data/ext/summarize/dic/uk.xml +98 -0
data/ext/summarize/dic/yi.xml +293 -0
data/ext/summarize/dictionary.c +331 -0
data/ext/summarize/extconf.rb +6 -0
data/ext/summarize/grader-tc.c +185 -0
data/ext/summarize/grader-tc.h +64 -0
data/ext/summarize/grader-tf.c +116 -0
data/ext/summarize/grader.c +85 -0
data/ext/summarize/highlighter.c +128 -0
data/ext/summarize/html.c +131 -0
data/ext/summarize/libots.h +158 -0
data/ext/summarize/parser.c +173 -0
data/ext/summarize/relations.c +163 -0
data/ext/summarize/stemmer.c +332 -0
data/ext/summarize/summarize.c +43 -0
data/ext/summarize/summarize.h +12 -0
data/ext/summarize/text.c +98 -0
data/ext/summarize/wordlist.c +220 -0
data/lib/summarize.rb +91 -0
data/lib/summarize/summarize.bundle +0 -0
data/sample_data/jupiter.txt +15 -0
data/summarize.gemspec +21 -0
metadata +140 -0

data/ext/summarize/stemmer.c ADDED Viewed

@@ -0,0 +1,332 @@
+/*
+ *  stemmer.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+#define MAX_PREFIX_SIZE 256
+/*
+ * Allocate an empty stemming rule set.
+ *
+ * The structure is obtained with g_new0(), so all of its members start out
+ * zero-filled.  Release it with free_stem_rule().
+ */
+OtsStemRule *
+new_stem_rule ()
+{
+  return g_new0 (OtsStemRule, 1);
+}
+/*
+ * Release a rule set and everything it owns.
+ *
+ * For each list stored in the rule, the data of every element is passed to
+ * g_free() and the list itself is then freed; the rule structure is freed
+ * last.  A NULL rule is ignored.
+ */
+void
+free_stem_rule (OtsStemRule *rule)
+{
+  unsigned int idx;
+
+  if (rule == NULL)
+    return;
+
+  {
+    /* the lists are released in exactly this order */
+    GList *owned[] = {
+      rule->RemovePre,
+      rule->RemovePost,
+      rule->step1_pre,
+      rule->step1_post,
+      rule->synonyms,
+      rule->manual,
+      rule->ParserBreak,
+      rule->ParserDontBreak,
+      rule->ReplaceChars
+    };
+
+    for (idx = 0; idx < sizeof (owned) / sizeof (owned[0]); idx++)
+      {
+        g_list_foreach (owned[idx], (GFunc) g_free, NULL);
+        g_list_free (owned[idx]);
+      }
+  }
+
+  g_free (rule);
+}
+/*
+ * Split a rule of the form "left|right" at its first '|'.
+ *
+ * comp   - NUL-terminated rule text, for example "red|blue".
+ * part_a - caller-allocated buffer of at least MAX_PREFIX_SIZE bytes; it
+ *          receives the text before the '|' ("red").
+ * part_b - caller-allocated buffer of at least MAX_PREFIX_SIZE bytes; it
+ *          receives the text after the '|' ("blue"), or "" when comp
+ *          contains no '|'.
+ *
+ * Each part is truncated to MAX_PREFIX_SIZE - 1 bytes so that the
+ * terminating NUL always lands inside the buffer.  Nothing is written when
+ * any argument is NULL.
+ */
+static void
+ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *part_b)
+{
+  size_t src = 0;
+  size_t dst = 0;
+
+  if (comp == NULL || part_a == NULL || part_b == NULL)
+    return;
+
+  /* Copy the left part; keep scanning past the capacity so that src ends
+     up on the real separator even when the left part had to be truncated. */
+  while (comp[src] != 0 && comp[src] != '|')
+    {
+      if (dst < MAX_PREFIX_SIZE - 1)
+        part_a[dst++] = comp[src];
+      src++;
+    }
+  part_a[dst] = 0;
+
+  if (comp[src] == '|')
+    src++;			/* skip the | mark */
+
+  dst = 0;
+  while (comp[src] != 0 && dst < MAX_PREFIX_SIZE - 1)
+    part_b[dst++] = comp[src++];
+  part_b[dst] = 0;
+}
+/*
+ * Replace a leading prefix of a word.
+ *
+ * aWord - NUL-terminated word to examine.
+ * pre   - prefix aWord has to start with (an empty prefix always matches).
+ * new   - text that takes the place of the prefix.
+ *
+ * Returns a newly allocated string (release it with g_free()) made of `new`
+ * followed by the part of aWord after `pre`.  Returns NULL when aWord does
+ * not start with `pre`, or when any argument is NULL.
+ */
+static unsigned char *
+ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new)
+{
+  size_t plen, wlen, nlen;
+  unsigned char *new_str = NULL;
+
+  if ((NULL==aWord)||(NULL==pre)||(NULL==new)) return NULL;
+
+  plen = strlen ((const char *) pre);
+  wlen = strlen ((const char *) aWord);
+  nlen = strlen ((const char *) new);
+
+  /* a prefix longer than the word can never match */
+  if (plen > wlen || memcmp (aWord, pre, plen) != 0)
+    return NULL;		/*no match */
+
+  /* g_new0 zero-fills the buffer, so the result is already NUL-terminated */
+  new_str = (unsigned char *) g_new0 (char, wlen + nlen + 5);
+  memcpy (new_str, new, nlen);
+  memcpy (new_str + nlen, aWord + plen, wlen - plen);
+  return new_str;
+}
+/*
+ * Replace a trailing suffix of a word.
+ *
+ * Returns a newly allocated string (release it with g_free()) holding aWord
+ * without its trailing `post`, followed by `new`.  Returns NULL when aWord
+ * does not end with `post`, when `post` is longer than aWord, or when any
+ * argument is NULL.
+ */
+static unsigned char *
+ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new)
+{
+  unsigned int word_len, post_len, new_len, stem_len;
+  unsigned char *result;
+
+  if (aWord == NULL || post == NULL || new == NULL)
+    return NULL;
+
+  word_len = strlen ((const char *) aWord);
+  post_len = strlen ((const char *) post);
+  new_len = strlen ((const char *) new);
+
+  if (post_len > word_len)
+    return NULL;
+
+  stem_len = word_len - post_len;
+  if (memcmp (aWord + stem_len, post, post_len) != 0)
+    return NULL;		/* no match */
+
+  result = (unsigned char *) g_new0 (char, word_len + new_len + 5);
+  memcpy (result, aWord, stem_len);		/* the word minus its suffix */
+  memcpy (result + stem_len, new, new_len + 1);	/* replacement and its NUL */
+  return result;
+}
+/*
+ * Whole-word replacement: when aWord is exactly `old`, return a g_strdup()
+ * copy of `new` (release it with g_free()); otherwise return NULL.
+ * A NULL aWord yields NULL.
+ */
+static unsigned char *
+ots_stem_replace_word (unsigned const char *aWord,unsigned const char *old,unsigned const char *new)
+{
+  if (aWord == NULL)
+    return NULL;
+
+  if (strcmp (aWord, old) != 0)
+    return NULL;
+
+  return g_strdup (new);
+}
+/*
+ * Normalise a word before stemming.
+ *
+ * The word is lowercased with g_utf8_strdown(); then every entry of
+ * rule->step1_pre is tried as a prefix replacement and every entry of
+ * rule->step1_post as a suffix replacement, in list order.  All matching
+ * entries are applied one after another (there is no early exit).
+ *
+ * Returns a newly allocated string the caller releases with g_free(), or
+ * NULL when aWord is NULL.
+ */
+unsigned char *
+ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule)
+{
+  GList *node;
+  unsigned char *word = NULL;
+  unsigned char *replaced = NULL;
+  char *left;
+  char *right;
+
+  if (aWord == NULL)
+    return NULL;
+
+  word = g_utf8_strdown (aWord, -1);	/* lowercase the word */
+  left = g_new0 (char, MAX_PREFIX_SIZE);
+  right = g_new0 (char, MAX_PREFIX_SIZE);
+
+  node = (GList *) rule->step1_pre;
+  while (node != NULL)
+    {
+      ots_stem_break (node->data, left, right);
+      replaced = ots_stem_remove_pre (word, left, right);
+      if (replaced != NULL)
+        {
+          g_free (word);
+          word = replaced;
+        }
+      node = node->next;
+    }
+
+  node = (GList *) rule->step1_post;
+  while (node != NULL)
+    {
+      ots_stem_break (node->data, left, right);
+      replaced = ots_stem_remove_post (word, left, right);
+      if (replaced != NULL)
+        {
+          g_free (word);
+          word = replaced;
+        }
+      node = node->next;
+    }
+
+  g_free (left);
+  g_free (right);
+  return word;
+}
+/* Signature shared by the prefix, suffix and whole-word replacement helpers. */
+typedef unsigned char *(*ots_stem_transform_fn) (unsigned const char *,
+                                                 unsigned const char *,
+                                                 unsigned const char *);
+
+/*
+ * Try each "old|new" entry of `rules` in order and apply the first one for
+ * which `transform` returns a replacement.  `word` is freed when it is
+ * replaced; the string to keep using is returned.  `prefix` and `newfix`
+ * are MAX_PREFIX_SIZE-byte scratch buffers.
+ */
+static unsigned char *
+ots_stem_apply_first_match (unsigned char *word, GList *rules,
+                            ots_stem_transform_fn transform,
+                            unsigned char *prefix, unsigned char *newfix)
+{
+  GList *li;
+
+  for (li = rules; li != NULL; li = li->next)
+    {
+      unsigned char *rep;
+
+      ots_stem_break (li->data, prefix, newfix);
+      rep = transform (word, prefix, newfix);
+      if (NULL != rep)
+        {
+          g_free (word);
+          return rep;
+        }
+    }
+  return word;
+}
+
+/*
+ * Reduce a word to its stem.
+ *
+ * The word is first normalised with ots_stem_format().  Then, in this
+ * order, the first matching entry of each list is applied: rule->manual
+ * (whole-word replacement), rule->RemovePre (prefix), rule->RemovePost
+ * (suffix) and rule->synonyms (whole-word replacement).  When the result is
+ * shorter than three bytes it is discarded and the normalised word is
+ * returned instead.
+ *
+ * Returns a newly allocated string the caller releases with g_free(), or
+ * NULL when aWord is NULL.
+ */
+unsigned char *
+ots_stem_strip (unsigned const char *aWord,const OtsStemRule * rule)
+{
+  unsigned char *prefix;
+  unsigned char *newfix;
+  unsigned char *normWord=NULL;
+
+  /* checked before the scratch buffers are allocated so nothing leaks */
+  if (aWord==NULL) return NULL;
+
+  prefix = (unsigned char *) g_new0 (char, MAX_PREFIX_SIZE);
+  newfix = (unsigned char *) g_new0 (char, MAX_PREFIX_SIZE);
+
+  normWord = ots_stem_format (aWord,rule);
+
+  normWord = ots_stem_apply_first_match (normWord, (GList *) rule->manual,
+                                         ots_stem_replace_word, prefix, newfix);
+  normWord = ots_stem_apply_first_match (normWord, (GList *) rule->RemovePre,
+                                         ots_stem_remove_pre, prefix, newfix);
+  normWord = ots_stem_apply_first_match (normWord, (GList *) rule->RemovePost,
+                                         ots_stem_remove_post, prefix, newfix);
+  normWord = ots_stem_apply_first_match (normWord, (GList *) rule->synonyms,
+                                         ots_stem_replace_word, prefix, newfix);
+
+  g_free (prefix);
+  g_free (newfix);
+
+  if (strlen ((const char *) normWord)<3)  /*stem is two letter long. thats not right. N(eed)==N(ation) ?*/
+    {
+      g_free (normWord);
+      normWord = ots_stem_format (aWord,rule);	/*lowercase the word */
+    }
+  return normWord;
+}

data/ext/summarize/summarize.c ADDED Viewed

@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <glib.h>
+#include <glib-object.h>
+#include <ruby.h>
+#include "libots.h"
+#include "summarize.h"
+const char *OTS_ERROR_BAD_DICT = "Cannot load dictionary file"; /* message summarize() raises as a RuntimeError when ots_load_xml_dictionary() fails */
+/*
+ * Define the Ruby module "Summarize" and register summarize() on it as the
+ * module function "summarize", which takes three arguments.
+ */
+void Init_summarize() {
+ VALUE summarize_module;
+
+ summarize_module = rb_define_module("Summarize");
+ rb_define_module_function(summarize_module, "summarize", summarize, 3);
+}
+/*
+ * Summarize.summarize(text, dictionary_file, ratio) -> String
+ *
+ * rb_str       - text to summarize; converted with StringValuePtr().
+ * rb_dict_file - path of the XML dictionary handed to
+ *                ots_load_xml_dictionary(); converted with StringValuePtr().
+ * rb_ratio     - integer passed to ots_highlight_doc() (presumably the
+ *                share of the text to keep -- confirm against libots).
+ *
+ * Returns a new Ruby String holding the selected text.  Raises RuntimeError
+ * with OTS_ERROR_BAD_DICT when the dictionary cannot be loaded; the Ruby
+ * conversion macros raise on arguments of the wrong type.
+ */
+static VALUE summarize(const VALUE self, const VALUE rb_str, const VALUE rb_dict_file, const VALUE rb_ratio) {
+  /* StringValuePtr() takes the address of its argument and may store a
+     converted object there, so it must not be applied to const parameters. */
+  VALUE str = rb_str;
+  VALUE dict = rb_dict_file;
+
+  /* All Ruby conversions happen before the article is allocated: they can
+     raise, and a raise after allocation would leak the article. */
+  char *text = StringValuePtr(str);
+  char *dictionary_file = StringValuePtr(dict);
+  long length = RSTRING_LEN(str);	/* read only after the type check above */
+  int ratio = NUM2INT(rb_ratio);
+
+  unsigned char *result;
+  size_t result_len = 0;
+  VALUE summary;
+
+  OtsArticle *doc = ots_new_article();
+
+  if (!ots_load_xml_dictionary(doc, dictionary_file)) {
+    ots_free_article(doc);
+    /* constant format string: the message is passed as data */
+    rb_raise(rb_eRuntimeError, "%s", OTS_ERROR_BAD_DICT);
+    return Qnil;	/* not reached */
+  }
+
+  ots_parse_stream(text, length, doc);
+  ots_grade_doc(doc);
+  ots_highlight_doc(doc, ratio);
+
+  result = ots_get_doc_text(doc, &result_len);
+  ots_free_article(doc);
+
+  /* rb_str_new() copies the bytes, so the buffer from ots_get_doc_text()
+     has to be released here. */
+  summary = rb_str_new((const char *) result, (long) result_len);
+  g_free(result);
+
+  return summary;
+}

data/ext/summarize/summarize.h ADDED Viewed

@@ -0,0 +1,12 @@
+#ifndef RSTRING_PTR /* fallback used only when the Ruby headers have not already defined RSTRING_PTR */
+#define RSTRING_PTR(s) (RSTRING(s)->ptr)
+#endif
+#ifndef RSTRING_LEN /* fallback used only when the Ruby headers have not already defined RSTRING_LEN */
+#define RSTRING_LEN(s) (RSTRING(s)->len)
+#endif
+#ifndef __summarize_h__ /* include guard; it covers only the prototype below */
+#define __summarize_h__
+static VALUE summarize(VALUE, VALUE, VALUE, VALUE); /* defined in summarize.c: (self, text, dictionary file, ratio) */
+#endif

data/ext/summarize/text.c ADDED Viewed

@@ -0,0 +1,98 @@
+/*
+ *  text.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+/*
+ * Concatenate the words of a sentence into one newly allocated string.
+ *
+ * aLine            - sentence to render; NULL yields NULL.
+ * only_if_selected - when TRUE, the words are included only if
+ *                    aLine->selected is set; otherwise the result is "".
+ * out_size         - optional; receives the length of the result in bytes.
+ *                    It is set to 0 when NULL is returned, so callers never
+ *                    read an unset length.
+ *
+ * Returns the text, which the caller releases with g_free(), or NULL when
+ * aLine is NULL.
+ */
+unsigned char *
+ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size)
+{
+  GList *li;
+  GString *text;
+
+  if (out_size)
+    *out_size = 0;
+  if (!(aLine))
+    return NULL;
+
+  text = g_string_new (NULL);
+  if (!only_if_selected || aLine->selected)
+    {
+      for (li = (GList *) aLine->words; li != NULL; li = li->next)	/* for each word in the sentence */
+        {
+          const char *word = (const char *) li->data;
+
+          if (word && word[0] != '\0')	/* skip missing and empty words */
+            g_string_append (text, word);
+        }
+    }
+
+  if (out_size)
+    *out_size = text->len;
+
+  /* with FALSE, g_string_free() releases the GString wrapper only and
+     returns the character data, which now belongs to the caller */
+  return (unsigned char *) g_string_free (text, FALSE);
+}
+/*
+ * Write the text of a sentence to `stream`, but only when the sentence is
+ * selected (ots_get_line_text() is called with only_if_selected = TRUE).
+ * Nothing is written when no text could be obtained, e.g. for a NULL
+ * sentence.
+ */
+static void
+ots_print_line (FILE * stream, const OtsSentence * aLine)
+{
+  unsigned char *utf8_txt;
+  size_t len = 0;
+
+  utf8_txt = ots_get_line_text (aLine, TRUE, &len);
+  if (utf8_txt == NULL)
+    return;			/* no buffer: nothing to write or free */
+
+  fwrite (utf8_txt, 1, len, stream);
+  g_free (utf8_txt);
+}
+/*
+ * Concatenate the text of every selected sentence of an article.
+ *
+ * Doc     - article whose `lines` list is walked in order.
+ * out_len - optional; receives the length of the result in bytes.
+ *
+ * Returns a newly allocated string the caller releases with g_free().
+ * List entries for which no text can be obtained (a NULL sentence) are
+ * skipped.
+ */
+unsigned char *
+ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
+{
+  GList *li;
+  GString *text;
+  unsigned char *utf8_data;
+  size_t line_len;
+
+  text = g_string_new (NULL);
+  for (li = (GList *) Doc->lines; li != NULL; li = li->next)
+    {
+      line_len = 0;
+      utf8_data = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len);
+      if (utf8_data == NULL)
+        continue;		/* nothing to append or free for this entry */
+      g_string_append_len (text, (const char *) utf8_data, line_len);
+      g_free (utf8_data);
+    }
+
+  if (out_len)
+    *out_len = text->len;
+
+  /* keep the character data, release only the GString wrapper */
+  return (unsigned char *) g_string_free (text, FALSE);
+}
+/*
+ * Print an article to `stream`: each entry of Doc->lines is handed to
+ * ots_print_line() in list order, and a single newline is written at the
+ * end.
+ */
+void
+ots_print_doc (FILE * stream, const OtsArticle * Doc)
+{
+  GList *node = (GList *) Doc->lines;
+
+  while (node != NULL)
+    {
+      ots_print_line (stream, (OtsSentence *) node->data);
+      node = node->next;
+    }
+
+  fputc ('\n', stream);
+}