RubyGems - summarize - Versions diffs - 1.0.0 - Mend

summarize 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

data/.gitignore +11 -0
data/README.markdown +42 -0
data/Rakefile +49 -0
data/ext/summarize/article.c +119 -0
data/ext/summarize/dic/bg.xml +101 -0
data/ext/summarize/dic/ca.xml +141 -0
data/ext/summarize/dic/cs.xml +161 -0
data/ext/summarize/dic/cy.xml +118 -0
data/ext/summarize/dic/da.xml +129 -0
data/ext/summarize/dic/de.xml +354 -0
data/ext/summarize/dic/el.xml +80 -0
data/ext/summarize/dic/en.xml +606 -0
data/ext/summarize/dic/eo.xml +171 -0
data/ext/summarize/dic/es.xml +369 -0
data/ext/summarize/dic/et.xml +172 -0
data/ext/summarize/dic/eu.xml +77 -0
data/ext/summarize/dic/fi.xml +105 -0
data/ext/summarize/dic/fr.xml +199 -0
data/ext/summarize/dic/ga.xml +124 -0
data/ext/summarize/dic/gl.xml +290 -0
data/ext/summarize/dic/he.xml +334 -0
data/ext/summarize/dic/hu.xml +280 -0
data/ext/summarize/dic/ia.xml +97 -0
data/ext/summarize/dic/id.xml +75 -0
data/ext/summarize/dic/is.xml +201 -0
data/ext/summarize/dic/it.xml +206 -0
data/ext/summarize/dic/lv.xml +77 -0
data/ext/summarize/dic/mi.xml +76 -0
data/ext/summarize/dic/ms.xml +160 -0
data/ext/summarize/dic/mt.xml +73 -0
data/ext/summarize/dic/nl.xml +245 -0
data/ext/summarize/dic/nn.xml +264 -0
data/ext/summarize/dic/pl.xml +92 -0
data/ext/summarize/dic/pt.xml +365 -0
data/ext/summarize/dic/ro.xml +163 -0
data/ext/summarize/dic/ru.xml +150 -0
data/ext/summarize/dic/sv.xml +255 -0
data/ext/summarize/dic/tl.xml +67 -0
data/ext/summarize/dic/tr.xml +65 -0
data/ext/summarize/dic/uk.xml +98 -0
data/ext/summarize/dic/yi.xml +293 -0
data/ext/summarize/dictionary.c +331 -0
data/ext/summarize/extconf.rb +6 -0
data/ext/summarize/grader-tc.c +185 -0
data/ext/summarize/grader-tc.h +64 -0
data/ext/summarize/grader-tf.c +116 -0
data/ext/summarize/grader.c +85 -0
data/ext/summarize/highlighter.c +128 -0
data/ext/summarize/html.c +131 -0
data/ext/summarize/libots.h +158 -0
data/ext/summarize/parser.c +173 -0
data/ext/summarize/relations.c +163 -0
data/ext/summarize/stemmer.c +332 -0
data/ext/summarize/summarize.c +43 -0
data/ext/summarize/summarize.h +12 -0
data/ext/summarize/text.c +98 -0
data/ext/summarize/wordlist.c +220 -0
data/lib/summarize.rb +91 -0
data/lib/summarize/summarize.bundle +0 -0
data/sample_data/jupiter.txt +15 -0
data/summarize.gemspec +21 -0
metadata +140 -0

data/ext/summarize/grader-tc.h ADDED Viewed

@@ -0,0 +1,64 @@
+/*
+ *  grader-tc.h
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef HAVE_GRADERTC_H
+#define HAVE_GRADERTC_H
+#include <glib.h>
+#include "libots.h"
+G_BEGIN_DECLS
+/* Word record: the word itself, its stem, and an occurrence counter. */
+typedef struct
+{
+  gchar *word;    /* the word */
+  gchar *stem;    /* stem of the word */
+  gint occ;			/* how many times have we seen this word in the text? */
+} OtsWordEntery;
+/* Word list manipulations.
+ * NOTE(review): only the prototypes are visible in this header; ownership and
+ * return-value conventions should be confirmed against the definitions. */
+void ots_free_wordlist (GList *aList);
+OtsWordEntery *ots_copy_wordEntery (OtsWordEntery * obj);
+OtsWordEntery *ots_new_wordEntery (unsigned const char *wordString);
+OtsWordEntery *ots_new_wordEntery_strip (unsigned const char *wordString,const OtsStemRule *rule);
+void ots_free_wordEntery (OtsWordEntery * WC);
+GList *ots_sort_list (GList* aList);
+GList *ots_union_list (const GList *aLst, const GList * bLst);
+char *ots_word_in_list (const GList *aList,const int index);
+char *ots_stem_in_list (const GList *aList,const int index);
+void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
+/* Grader: entry point that grades a whole article (the term-count grader). */
+void ots_grade_doc_tc (OtsArticle * Doc);
+G_END_DECLS
+#endif /* HAVE_GRADERTC_H */

data/ext/summarize/grader-tf.c ADDED Viewed

@@ -0,0 +1,116 @@
+/*
+ *  grader-tf.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+/*Grader - using the Term frequency algorithm. Will give each line a score*/
+/*
+ * Allocate a zero-initialised OtsWordTF and fill it in.
+ *
+ * word: text to store; a private copy is made with g_strdup(). May be NULL,
+ *       in which case the word field stays NULL.
+ * tf:   value stored in the tf field.
+ *
+ * Returns the newly allocated record; release it with ots_free_OtsWordTF().
+ */
+OtsWordTF *
+ots_new_OtsWordTF (const char *word, const double tf)
+{
+  OtsWordTF *entry = g_new0 (OtsWordTF, 1);
+
+  /* g_strdup() maps NULL to NULL, so no explicit NULL test is required. */
+  entry->word = g_strdup (word);
+  entry->tf = tf;
+  return entry;
+}
+/*
+ * Release an OtsWordTF together with the word string it holds.
+ * Passing NULL is allowed and does nothing.
+ */
+void
+ots_free_OtsWordTF (OtsWordTF *obj)
+{
+  if (obj == NULL)
+    return;
+
+  g_free (obj->word);		/* g_free() ignores a NULL pointer */
+  g_free (obj);
+}
+/*
+ * Free every element of a list with ots_free_OtsWordTF() and then free the
+ * list cells themselves. A NULL list is accepted and ignored.
+ */
+void
+ots_free_TF_wordlist (GList * aList)
+{
+  if (aList == NULL)
+    return;
+
+  g_list_foreach (aList, (GFunc) ots_free_OtsWordTF, NULL);
+  g_list_free (aList);
+}
+/*
+ * Per-sentence term-frequency grader.
+ * Currently a placeholder: the sentence is left untouched.
+ */
+void
+ots_grade_line_tf (OtsSentence * aLine)
+{
+  (void) aLine;			/* nothing is scored yet */
+}
+/*
+ * Run ots_grade_line_tf() over every sentence of the article.
+ * Returns immediately when the article has no lines.
+ * The tf and idf lists are not loaded yet, so nothing is passed to the
+ * per-line grader beyond the sentence itself.
+ */
+void
+ots_grade_doc_tf (OtsArticle * Doc)
+{
+  GList *node;
+
+  if (Doc->lineCount == 0)
+    return;
+
+  node = (GList *) Doc->lines;
+  while (node != NULL)
+    {
+      ots_grade_line_tf ((OtsSentence *) node->data);
+      node = node->next;
+    }
+}
+/*
+ * Combine the two weights of a word into a single score.
+ *
+ * tf:  how often the word occurs in the document.
+ * idf: how rare the word is across the collection.
+ *
+ * Returns the product tf * idf.
+ */
+double
+ots_tf_word_score (const double tf, const double idf)
+{
+  const double score = tf * idf;
+
+  return score;
+}
+/*
+Determine frequency of query words
+n = (num-of-sentences words appears in)
+N = (total-number-of-sentences)
+f = n/N
+*/
+/*
+ * Compute -log(doc_word_count / term_count).
+ *
+ * The ratio is evaluated in floating point; the previous integer division
+ * truncated it before the logarithm was taken. When either count is zero or
+ * negative the ratio is undefined (division by zero or log of a non-positive
+ * value), so 0.0 is returned instead.
+ *
+ * NOTE(review): the comment preceding this function describes f = n/N, while
+ * the expression uses N/n; the orientation is kept as it was - confirm which
+ * one is intended.
+ */
+double
+ots_calc_idf (const int term_count, const int doc_word_count)
+{
+  if (term_count <= 0 || doc_word_count <= 0)
+    return 0.0;
+
+  return -log ((double) doc_word_count / (double) term_count);
+}
+/*
+ * Compute 0.5 + 0.5 * (doc_word_count / term_count).
+ *
+ * Returns 0 when term_count is 0. The ratio is evaluated in floating point;
+ * the previous integer division discarded its fractional part.
+ */
+double
+ots_calc_tf (const int term_count, const int doc_word_count)
+{
+  if (term_count == 0)
+    return 0.0;
+
+  return 0.5 + 0.5 * ((double) doc_word_count / (double) term_count);
+}

data/ext/summarize/grader.c ADDED Viewed

@@ -0,0 +1,85 @@
+/*
+ *  grader.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+extern void ots_grade_doc_tc (OtsArticle * Doc);
+/*Grader driver - will call one of the grading algorithm*/
+/*
+ * Structural pass over an already graded article; must be called after the
+ * first grader has assigned a score to every line.
+ *
+ *  - The first line (title) is very important, so its score is doubled.
+ *  - Every line whose first two tokens are both "\n" (a new paragraph) has
+ *    its score multiplied by 1.6.
+ *
+ * strcmp() returns 0 on equality, so the paragraph test compares against 0
+ * explicitly. The previous condition used the raw strcmp() results and
+ * therefore boosted exactly the lines that did NOT start a paragraph.
+ *
+ * A NULL article or an article without lines is left untouched.
+ */
+void
+ots_grade_structure (OtsArticle * Doc) /*must be called after the first grader*/
+{
+  GList *li;
+  OtsSentence *first_line;
+
+  if (Doc == NULL || Doc->lines == NULL)
+    return;
+
+  first_line = (OtsSentence *) Doc->lines->data;
+  if (first_line != NULL)
+    first_line->score *= 2;
+
+  for (li = (GList *) Doc->lines; li != NULL; li = li->next)
+    {
+      OtsSentence *aLine = li->data;
+      GList *first;
+      GList *second;
+
+      if (aLine == NULL)
+	continue;
+
+      first = (GList *) aLine->words;
+      if (first == NULL)
+	continue;
+
+      second = first->next;
+      if (second == NULL)
+	continue;
+
+      /* A token without text cannot be a newline marker. */
+      if (first->data == NULL || second->data == NULL)
+	continue;
+
+      if (0 == strcmp (first->data, "\n") && 0 == strcmp (second->data, "\n"))
+	aLine->score *= 1.6;
+    }
+}
+/**
+Each grader needs to do:
+1.give a ->score to each line
+2.Set the ->title of the document
+**/
+/*
+ * Grader driver: score the article with the term-count grader and then apply
+ * the structural adjustments. Does nothing for a NULL article.
+ * The term-frequency grader is not called from here.
+ */
+void
+ots_grade_doc (OtsArticle * Doc)
+{
+  if (Doc != NULL)
+    {
+      ots_grade_doc_tc (Doc);	/* term count */
+      ots_grade_structure (Doc);
+    }
+}

data/ext/summarize/highlighter.c ADDED Viewed

@@ -0,0 +1,128 @@
+/*
+ *  highlighter
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+/*After the grader has graded the article and each
+ sentence has a score the highlighter will select
+ some of the sentences*/
+/*
+ * Select the best not-yet-selected sentence of the article.
+ *
+ * The first pass finds the highest score among unselected lines (starting
+ * from 0); the second pass marks the first unselected line carrying that
+ * score as selected.
+ *
+ * Returns the wc field of the line that was selected, or 0 when no line
+ * matched.
+ */
+static int
+ots_highlight_max_line (OtsArticle * Doc)
+{
+  GList *node;
+  OtsSentence *line;
+  int best = 0;
+
+  /* Pass 1: highest score among the lines that are still unselected. */
+  for (node = (GList *) Doc->lines; node != NULL; node = node->next)
+    {
+      line = (OtsSentence *) node->data;
+      if (line->selected == 0)
+	best = MAX (line->score, best);
+    }
+
+  /* Pass 2: mark the first unselected line that reaches that score. */
+  for (node = (GList *) Doc->lines; node != NULL; node = node->next)
+    {
+      line = (OtsSentence *) node->data;
+      if (!(line->score == best) || !(line->selected == 0))
+	continue;
+
+      line->selected = 1;
+      return line->wc;
+    }
+
+  return 0;
+}
+/* TODO: implement these
+void
+ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
+void
+ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
+void
+ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by average of nearby sentences, will mark blocks
+*/
+/*
+ * Select sentences, best score first, until the word counts of the selected
+ * sentences add up to at least `percent` percent of the article word count.
+ *
+ * percent is clamped to the range 0..100. An article without lines is left
+ * untouched.
+ */
+void
+ots_highlight_doc (OtsArticle * Doc, int percent)
+{
+  int picked_words = 0;
+  double target;
+
+  if (Doc->lineCount == 0)
+    return;
+
+  if (percent > 100)
+    percent = 100;
+  else if (percent < 0)
+    percent = 0;
+
+  target = (((double) percent) / 100.0)
+    * (double) ots_get_article_word_count (Doc);
+
+  while (picked_words < target)
+    picked_words += ots_highlight_max_line (Doc);
+}
+/*
+ * Select up to `lines` sentences, best score first, never more than the
+ * number of lines in the article. An article without lines is left untouched.
+ */
+void
+ots_highlight_doc_lines (OtsArticle * Doc, int lines)
+{
+  int picked;
+  int available;
+
+  if (Doc->lineCount == 0)
+    return;
+
+  available = Doc->lineCount;
+  for (picked = 0; picked < lines && picked < available; picked++)
+    ots_highlight_max_line (Doc);
+}
+/*
+ * Select sentences, best score first, for as long as the running total of
+ * selected words is below the article word count and does not exceed `words`.
+ * An article without lines is left untouched.
+ */
+void ots_highlight_doc_words (OtsArticle * Doc, int words)
+{
+  int picked_words = 0;
+  int total_words;
+
+  if (Doc->lineCount == 0)
+    return;
+
+  total_words = ots_get_article_word_count (Doc);
+  for (;;)
+    {
+      if (picked_words >= total_words || picked_words > words)
+	break;
+      picked_words += ots_highlight_max_line (Doc);
+    }
+}

data/ext/summarize/html.c ADDED Viewed

@@ -0,0 +1,131 @@
+/*
+ *  html.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+/*
+ * Render one sentence as an HTML fragment.
+ *
+ * The fragment consists of a comment holding the sentence score, an opening
+ * FONT/span pair (with a yellow background when the sentence is selected),
+ * the words of the sentence with "\n" tokens turned into <br>, and the
+ * closing tags.
+ *
+ * aLine:    sentence to render.
+ * out_size: if not NULL, receives the length of the fragment in bytes.
+ *
+ * Returns a newly allocated, NUL-terminated buffer; release it with g_free().
+ *
+ * The score comment is formatted straight into the GString, which removes
+ * the fixed-size scratch buffer and unbounded sprintf() used previously.
+ *
+ * NOTE(review): words are appended verbatim, without HTML escaping - confirm
+ * that callers never pass markup-significant text they do not trust.
+ */
+static unsigned char *
+ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
+{
+  GList *li;
+  GString *text;
+
+  text = g_string_new (NULL);
+  g_string_append_printf (text, "<!--(%ld)-->", aLine->score);
+
+  if ((aLine->selected))
+    {
+      g_string_append (text,
+		       "<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
+    }
+  else
+    {
+      g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
+    }
+
+  for (li = (GList *) aLine->words; li != NULL; li = li->next)
+    {
+      const char *word = (const char *) li->data;
+
+      if (0 == strcmp (word, "\n"))
+	g_string_append (text, "<br>");
+      else
+	g_string_append (text, word);
+    }
+
+  g_string_append (text, "</span></FONT>\n");
+
+  if (out_size)
+    *out_size = text->len;
+
+  /* Hand the character data to the caller; only the GString shell is freed. */
+  return (unsigned char *) g_string_free (text, FALSE);
+}
+#if 0
+/* Disabled helper: write the HTML fragment of one sentence to a stream. */
+static void
+ots_print_line_HTML (FILE * stream, const OtsSentence * aLine)
+{
+  size_t fragment_len;
+  unsigned char *fragment = ots_get_line_HTML (aLine, &fragment_len);
+
+  fwrite (fragment, 1, fragment_len, stream);
+  g_free (fragment);
+}
+#endif
+/*
+ * Render the whole article as an HTML document.
+ *
+ * The document starts with a fixed head section, a generator comment and a
+ * comment holding the article title, followed by the HTML fragment of every
+ * sentence and the closing body/html tags.
+ *
+ * Doc:     article to render.
+ * out_len: if not NULL, receives the length of the document in bytes.
+ *
+ * Returns a newly allocated, NUL-terminated buffer; release it with g_free().
+ *
+ * A NULL title is skipped (an empty comment is emitted) instead of being
+ * handed to g_string_append(), which rejects NULL with a GLib critical.
+ */
+unsigned char *
+ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
+{
+  GList *li;
+  GString *text;
+  unsigned char *line_html;
+  size_t line_len;
+
+  text = g_string_new (NULL);
+  g_string_append (text,
+		   "<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
+  g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
+  g_string_append (text, "<!--");
+  if (Doc->title != NULL)
+    g_string_append (text, (const char *) Doc->title);
+  g_string_append (text, "-->\n");
+
+  for (li = (GList *) Doc->lines; li != NULL; li = li->next)
+    {
+      line_html = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
+      g_string_append_len (text, (const char *) line_html, line_len);
+      g_free (line_html);
+    }
+
+  g_string_append (text, "</body></html>\n");
+
+  if (out_len)
+    *out_len = text->len;
+
+  /* Hand the character data to the caller; only the GString shell is freed. */
+  return (unsigned char *) g_string_free (text, FALSE);
+}
+/*
+ * Write the HTML rendering of the article to a stream.
+ * The temporary buffer produced by ots_get_doc_HTML() is freed before
+ * returning.
+ */
+void
+ots_print_HTML (FILE * stream, const OtsArticle * Doc)
+{
+  size_t html_len;
+  unsigned char *html = ots_get_doc_HTML (Doc, &html_len);
+
+  fwrite (html, 1, html_len, stream);
+  g_free (html);
+}