RubyGems - ots - Versions diffs - 0.4.3 → 0.4.4 - Mend

ots 0.4.3 → 0.4.4

Files changed (64) hide show

data/ext/text.c ADDED

@@ -0,0 +1,98 @@
+/*
+ *  text.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+/*
+ * Concatenate the words of a sentence into one newly allocated string.
+ *
+ * aLine:            sentence to render; NULL yields a NULL result.
+ * only_if_selected: when TRUE, a sentence whose `selected` flag is unset
+ *                   yields an empty string.
+ * out_size:         optional; receives the byte length of the result
+ *                   (0 when NULL is returned).
+ *
+ * Returns a buffer the caller must release with g_free(), or NULL when
+ * aLine is NULL.
+ */
+unsigned char *
+ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size)
+{
+  GList *li;
+  GString *text;
+
+  if (out_size)
+    *out_size = 0;              /* never leave the caller's length unset */
+  if (!aLine)
+    return NULL;
+
+  text = g_string_new (NULL);
+  if (!only_if_selected || aLine->selected)
+    {
+      for (li = (GList *) aLine->words; li != NULL; li = li->next)      /* for each word in the sentence */
+        {
+          const char *word = (const char *) li->data;
+
+          if (word && *word)    /* skip missing or empty words */
+            g_string_append (text, word);
+        }
+    }
+  if (out_size)
+    *out_size = text->len;
+
+  /* Free only the GString wrapper; the character data goes to the caller. */
+  return (unsigned char *) g_string_free (text, FALSE);
+}
+/*
+ * Write the text of a sentence to stream.  ots_get_line_text() is called with
+ * only_if_selected = TRUE, so an unselected sentence writes nothing.
+ * A NULL sentence is ignored.
+ */
+static void
+ots_print_line (FILE * stream, const OtsSentence * aLine)
+{
+  size_t len = 0;               /* stays 0 if the helper returns without setting it */
+  unsigned char *utf8_txt = ots_get_line_text (aLine, TRUE, &len);
+
+  if (utf8_txt == NULL)         /* NULL sentence: nothing to write or free */
+    return;
+  if (len > 0)
+    fwrite (utf8_txt, 1, len, stream);
+  g_free (utf8_txt);
+}
+/*
+ * Concatenate the text of every selected line of the article into one newly
+ * allocated string (each line is rendered with only_if_selected = TRUE).
+ *
+ * Doc:     article to render; NULL yields a NULL result.
+ * out_len: optional; receives the byte length of the result
+ *          (0 when NULL is returned).
+ *
+ * Returns a buffer the caller must release with g_free(), or NULL when
+ * Doc is NULL.
+ */
+unsigned char *
+ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
+{
+  GList *li;
+  GString *text;
+
+  if (out_len)
+    *out_len = 0;
+  if (!Doc)
+    return NULL;
+
+  text = g_string_new (NULL);
+  for (li = (GList *) Doc->lines; li != NULL; li = li->next)
+    {
+      size_t line_len = 0;
+      unsigned char *line = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len);
+
+      if (line == NULL)         /* NULL list entry: nothing to append */
+        continue;
+      g_string_append_len (text, (const gchar *) line, (gssize) line_len);
+      g_free (line);
+    }
+  if (out_len)
+    *out_len = text->len;
+
+  /* Free only the GString wrapper; the character data goes to the caller. */
+  return (unsigned char *) g_string_free (text, FALSE);
+}
+/*
+ * Print the article to stream: each entry of Doc->lines is handed to
+ * ots_print_line(), then a single newline is written.
+ */
+void
+ots_print_doc (FILE * stream, const OtsArticle * Doc)
+{
+  GList *node = (GList *) Doc->lines;
+
+  while (node != NULL)          /* walk every line of the article */
+    {
+      ots_print_line (stream, (OtsSentence *) node->data);
+      node = node->next;
+    }
+  fputc ('\n', stream);
+}

data/ext/version.h ADDED

	@@ -0,0 +1,2 @@
1	+ #pragma once
2	+ #define RUBY_OTS_VERSION "0.4.4"

data/ext/wordlist.c ADDED

@@ -0,0 +1,220 @@
+/*
+ *  wordlist.c
+ *
+ *  Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libots.h"
+#include "grader-tc.h"
+/* Word-list manipulation, mainly for grader-tc. */
+/*
+ * Allocate a word entry for real text.  The entry starts with occ = 1;
+ * `word` comes from ots_stem_format() and `stem` from ots_stem_strip(),
+ * both applied to wordString with the given rule.
+ */
+OtsWordEntery *
+ots_new_wordEntery_strip(unsigned const char *wordString,const OtsStemRule *rule)
+{
+  OtsWordEntery *entry;
+
+  entry = g_new0 (OtsWordEntery, 1);
+  entry->occ = 1;
+  entry->word = ots_stem_format (wordString, rule);
+  entry->stem = ots_stem_strip (wordString, rule);
+
+  return entry;
+}
+/*
+ * Allocate a word entry for dictionary use only: no formatting is applied,
+ * so `word` and `stem` are both plain duplicates of wordString.  The entry
+ * starts with occ = 1.
+ */
+OtsWordEntery *
+ots_new_wordEntery (unsigned const char *wordString)
+{
+  OtsWordEntery *entry;
+
+  entry = g_new0 (OtsWordEntery, 1);
+  entry->occ = 1;
+  entry->word = g_strdup (wordString);
+  entry->stem = g_strdup (wordString);
+
+  return entry;
+}
+/*
+ * Release a word entry together with its `word` and `stem` strings.
+ * A NULL entry is ignored.
+ */
+void
+ots_free_wordEntery (OtsWordEntery * WC)
+{
+  if (WC == NULL)
+    return;
+
+  /* g_free() ignores NULL, so the members need no individual guards. */
+  g_free (WC->word);
+  g_free (WC->stem);
+  g_free (WC);
+}
+/*
+ * Release a word list: ots_free_wordEntery() is applied to every element's
+ * data, then the list nodes themselves are freed.  A NULL list is ignored.
+ */
+void
+ots_free_wordlist (GList * aList)
+{
+  if (aList == NULL)
+    return;
+
+  g_list_foreach (aList, (GFunc) ots_free_wordEntery, NULL);
+  g_list_free (aList);
+}
+/*
+ * Duplicate a word entry: occ is copied, `word` and `stem` are duplicated
+ * with g_strdup().  Returns NULL when obj is NULL.
+ */
+OtsWordEntery *
+ots_copy_wordEntery (OtsWordEntery * obj)
+{
+  OtsWordEntery *copy;
+
+  if (obj == NULL)
+    return NULL;
+
+  copy = g_new (OtsWordEntery, 1);
+  copy->occ = obj->occ;
+  copy->word = g_strdup (obj->word);
+  /* g_strdup (NULL) returns NULL, so a missing stem stays NULL in the copy. */
+  copy->stem = g_strdup (obj->stem);
+
+  return copy;
+}
+/*
+ * Comparison callback ordering word entries by descending occ:
+ * negative when node1 occurs more often than node2, positive when it occurs
+ * less often, 0 when the counts are equal.
+ */
+static int
+ots_sort_handler (OtsWordEntery * node1, OtsWordEntery * node2)
+{
+  /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
+  return (node1->occ < node2->occ) - (node1->occ > node2->occ);
+}
+/*
+ * Sort a word list with g_list_sort() using ots_sort_handler and return the
+ * new head of the list.
+ */
+GList *
+ots_sort_list (GList* aList)
+{
+  return g_list_sort (aList, (GCompareFunc) ots_sort_handler);
+}
+/*
+ * Build a new list holding copies (ots_copy_wordEntery) of the entries of
+ * aLst whose word matches no word in bLst.  Despite the name, the result is
+ * the difference "aLst minus bLst", not a set union.
+ *
+ * Words are compared case-insensitively on at most their first 10 bytes
+ * (fix me: unicode issue?).  Entries of aLst with NULL data are skipped;
+ * entries whose word is NULL can never match and are therefore copied.
+ *
+ * The returned list keeps the order of aLst and is owned by the caller.
+ */
+GList *
+ots_union_list (const GList *aLst, const GList * bLst)
+{
+  enum { WORD_CMP_LEN = 10 };   /* number of leading bytes compared */
+  GList *li;
+  GList *di;
+  GList *newLst = NULL;
+
+  for (li = (GList *) aLst; li != NULL; li = li->next)
+    {
+      const OtsWordEntery *a = (const OtsWordEntery *) li->data;
+      gboolean in_b = FALSE;
+
+      if (a == NULL)            /* nothing to copy */
+        continue;
+
+      if (a->word != NULL)
+        {
+          /* Stop scanning bLst as soon as one match is found. */
+          for (di = (GList *) bLst; di != NULL && !in_b; di = di->next)
+            {
+              const OtsWordEntery *b = (const OtsWordEntery *) di->data;
+
+              if (b != NULL && b->word != NULL
+                  && 0 == g_strncasecmp (a->word, b->word, WORD_CMP_LEN))
+                in_b = TRUE;    /* word is in B */
+            }
+        }
+
+      /* Prepend is O(1); order is restored by the reverse below. */
+      if (!in_b)
+        newLst = g_list_prepend (newLst, ots_copy_wordEntery ((OtsWordEntery *) li->data));
+    }
+
+  return g_list_reverse (newLst);
+}
+/*
+ * Return the `word` string of the index'th entry of aList, or NULL when the
+ * list has no such element or the element carries no data.  The pointer
+ * refers to the entry's own string; it is not a copy.
+ */
+char *
+ots_word_in_list (const GList *aList,const int index)
+{
+  GList *node = (GList *) g_list_nth ((GList *) aList, index);
+  OtsWordEntery *entry = (node != NULL) ? node->data : NULL;
+
+  return (entry != NULL) ? entry->word : NULL;
+}
+/*
+ * Return the `stem` string of the index'th entry of aList, or NULL when the
+ * list has no such element or the element carries no data.  The pointer
+ * refers to the entry's own string; it is not a copy.
+ */
+char *
+ots_stem_in_list (const GList *aList,const int index)
+{
+  GList *node = (GList *) g_list_nth ((GList *) aList, index);
+  OtsWordEntery *entry = (node != NULL) ? node->data : NULL;
+
+  return (entry != NULL) ? entry->stem : NULL;
+}
+/*
+ * Add a word to the word count of the article.
+ *
+ * The word is stemmed with the article's stem rule (Doc->stem).  If an entry
+ * with the same stem already exists in Doc->wordStat its occurrence counter
+ * is incremented; otherwise a new entry is prepended to the list.
+ *
+ * Nothing happens when Doc or wordString is NULL, or when wordString is
+ * empty or exactly " ", "\n" or "\t".
+ */
+void
+ots_add_wordstat (OtsArticle * Doc,
+		  unsigned const char *wordString)
+{
+  GList *li;
+  OtsWordEntery *stat;
+  OtsStemRule *rule;
+  const char *word = (const char *) wordString;
+  char *tmp;
+
+  /* Validate both arguments before touching Doc. */
+  if (NULL == Doc || NULL == word)
+    return;
+  if ('\0' == word[0])
+    return;
+  if (0 == strcmp (word, " ") || 0 == strcmp (word, "\n") || 0 == strcmp (word, "\t"))
+    return;
+
+  rule = Doc->stem;
+  tmp = ots_stem_strip (wordString, rule);
+
+  if (tmp != NULL)              /* strcmp() must never see a NULL stem */
+    {
+      for (li = (GList *) Doc->wordStat; li != NULL; li = li->next)     /* search the word in current wordlist */
+        {
+          OtsWordEntery *entry = (OtsWordEntery *) li->data;
+
+          if (entry != NULL && entry->stem != NULL && 0 == strcmp (tmp, entry->stem))
+            {
+              entry->occ++;     /* occurred in another place in the text */
+              g_free (tmp);
+              return;
+            }
+        }
+    }
+
+  stat = ots_new_wordEntery_strip (wordString, rule);   /* not in the list yet: add it */
+  if (stat != NULL)
+    Doc->wordStat = g_list_prepend (Doc->wordStat, stat);
+  g_free (tmp);
+}
+/*
+ * Print one "Word[occ][word]" line per entry of aList to stream.
+ * List elements without data are skipped; an entry whose word is NULL is
+ * printed as "(null)" instead of handing a NULL pointer to %s.
+ */
+void
+ots_print_wordlist (FILE * stream, const GList * aList)
+{
+  GList *li;
+
+  for (li = (GList *) aList; li != NULL; li = li->next)
+    {
+      const OtsWordEntery *entry = (const OtsWordEntery *) li->data;
+
+      if (entry == NULL)
+        continue;
+      fprintf (stream, "Word[%d][%s]\n", entry->occ,
+               entry->word != NULL ? entry->word : "(null)");
+    }
+}

data/test/helper.rb ADDED

@@ -0,0 +1,3 @@
+require 'ots'
+require 'minitest/spec'
+require 'minitest/autorun'

data/test/test_article.rb ADDED

@@ -0,0 +1,52 @@
+# encoding: utf-8
+require 'helper'
+describe 'OTS::Article' do
+  before do
+    @sample = <<-TEXT
+      The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
+      It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
+      Pacific subspecies.
+    TEXT
+    @article = OTS::Article.new(@sample)
+  end
+  it 'should extract title keywords from given document' do
+    assert_equal 'species,turtle,subspecies,pacific,atlantic', @article.title
+  end
+  it 'should extract keywords from given document' do
+    expect = %w{
+      species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
+      belonging sea endangered critically hawksbill
+    }
+    assert_equal expect, @article.keywords
+  end
+  it 'should extract keywords from given document' do
+    lines  = @article.summarize(lines: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
+    expect = [
+      ["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
+      ["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
+    ]
+    assert_equal expect, lines
+  end
+  it 'should utf8 encode strings properly' do
+    text    = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
+    article = OTS.parse(text)
+    summary = article.summarize(lines: 1).first[:sentence]
+    assert_equal text, summary
+  end
+  describe 'dictionaries' do
+    it 'should load the french dictionary' do
+      article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
+      assert_equal "j'ai besoin de la crème glacée.", article.summarize(lines: 1).first[:sentence]
+    end
+  end
+end

data/test/test_ots.rb ADDED

@@ -0,0 +1,23 @@
+require 'helper'
+describe 'OTS' do
+  it 'parse() should return an article instance' do
+    OTS.parse("hello world").must_be_kind_of OTS::Article
+  end
+  it 'parse() should raise ArgumentError on invalid text' do
+    assert_raises(ArgumentError) do
+      OTS.parse(1)
+    end
+  end
+  it 'should return a list of dictonaries' do
+    dictionaries = OTS.dictionaries
+    %w(en fr it es de ru).each do |name|
+      assert dictionaries.include?(name), "has #{name} dictionary"
+    end
+    assert_empty dictionaries.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
+  end
+end

metadata CHANGED

@@ -1,64 +1,148 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: ots
-version: !ruby/object:Gem::Version
-  version: 0.4.3
-  prerelease:
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 4
+  - 4
+  version: 0.4.4
 platform: ruby
-authors:
+authors:
 - Bharanee Rathna
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-23 00:00:00.000000000Z
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: shoulda
-  requirement: &17368280 !ruby/object:Gem::Requirement
+date: 2012-01-09 00:00:00 +11:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '2.10'
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
   type: :development
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
   prerelease: false
-  version_requirements: *17368280
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id002
 description: Ruby interface to libots libraries for unix.
-email: deepfryed@gmail.com
+email:
+- deepfryed@gmail.com
 executables: []
-extensions:
+extensions:
 - ext/extconf.rb
-extra_rdoc_files:
-- README
-files:
-- README
-- VERSION
+extra_rdoc_files: []
+files:
+- ext/text.c
+- ext/grader-tf.c
+- ext/stemmer.c
+- ext/article.c
+- ext/grader-tc.c
+- ext/html.c
+- ext/grader.c
 - ext/ots.c
-- lib/ots.rb
-- test/ots_test.rb
+- ext/relations.c
+- ext/parser.c
+- ext/dictionary.c
+- ext/highlighter.c
+- ext/wordlist.c
+- ext/grader-tc.h
+- ext/ots.h
+- ext/libots.h
+- ext/version.h
 - ext/extconf.rb
+- test/test_article.rb
+- test/test_ots.rb
+- test/helper.rb
+- README.md
+- dictionaries/cy.xml
+- dictionaries/tr.xml
+- dictionaries/fr.xml
+- dictionaries/yi.xml
+- dictionaries/ms.xml
+- dictionaries/ia.xml
+- dictionaries/lv.xml
+- dictionaries/gl.xml
+- dictionaries/cs.xml
+- dictionaries/sv.xml
+- dictionaries/is.xml
+- dictionaries/fi.xml
+- dictionaries/bg.xml
+- dictionaries/uk.xml
+- dictionaries/et.xml
+- dictionaries/tl.xml
+- dictionaries/da.xml
+- dictionaries/it.xml
+- dictionaries/ru.xml
+- dictionaries/nl.xml
+- dictionaries/eo.xml
+- dictionaries/mi.xml
+- dictionaries/ro.xml
+- dictionaries/pl.xml
+- dictionaries/ga.xml
+- dictionaries/he.xml
+- dictionaries/mt.xml
+- dictionaries/eu.xml
+- dictionaries/hu.xml
+- dictionaries/en.xml
+- dictionaries/de.xml
+- dictionaries/el.xml
+- dictionaries/pt.xml
+- dictionaries/ca.xml
+- dictionaries/es.xml
+- dictionaries/nn.xml
+- dictionaries/id.xml
+has_rdoc: true
 homepage: http://github.com/deepfryed/ots
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ! '>='
-    - !ruby/object:Gem::Version
-      version: '0'
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ! '>='
-    - !ruby/object:Gem::Version
-      version: '0'
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.2
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
 summary: Open Text Summarizer interface for Ruby.
-test_files:
-- test/ots_test.rb
+test_files: []