ots 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -1,26 +1,25 @@
1
1
  require 'mkmf'
2
2
 
3
- $CFLAGS = %x{pkg-config --cflags glib-2.0}.strip
4
- $LGFLAGS = %x{pkg-config --libs glib-2.0}.strip
3
+ glib_cflags = %x{pkg-config --cflags glib-2.0}.strip
4
+ glib_ldflags = %x{pkg-config --libs glib-2.0}.strip
5
5
 
6
- if $CFLAGS.empty?
6
+ if glib_cflags.empty?
7
7
  warn %q{WARNING: No pkg-config found for glib-2.0, using defaults. Set GLIB_INCLUDE_DIR env to override.}
8
8
  dirs = ENV.fetch('GLIB_INCLUDE_DIR', '/usr/include/glib-2.0 /usr/lib/glib-2.0/include')
9
- $CFLAGS = dirs.split(/\s+/).map {|dir| "-I#{dir}"}.join(' ')
9
+ glib_cflags = dirs.split(/\s+/).map {|dir| "-I#{dir}"}.join(' ')
10
10
  end
11
11
 
12
- if $LDFLAGS.empty?
12
+ if glib_ldflags.empty?
13
13
  warn %q{WARNING: No pkg-config found for glib-2.0, using defaults. Set GLIB_LIB env to override.}
14
14
  libs = ENV.fetch('GLIB_LIB', 'glib-2.0')
15
- $LDFLAGS = libs.split(/\s+/).map {|lib| "-l#{lib}"}.join(' ')
15
+ glib_ldflags = libs.split(/\s+/).map {|lib| "-l#{lib}"}.join(' ')
16
16
  end
17
17
 
18
- dir_config("libots", ["/usr/local", "/opt/local", "/usr"])
18
+ dir = File.expand_path(File.dirname(__FILE__) + '/../dictionaries')
19
+ $CFLAGS = glib_cflags + %Q{ -I/usr/include/libxml2 -DDICTIONARY_DIR='"#{dir}/"'}
20
+ $LDFLAGS = glib_ldflags
19
21
 
20
- headers = [ 'stdio.h', 'stdlib.h', 'string.h', 'libots-1/ots/libots.h' ]
21
- if have_header('libots-1/ots/libots.h') && have_library('ots-1', 'ots_new_article', headers)
22
- create_makefile 'ots'
23
- else
24
- puts "Cannot find libots headers or libraries"
25
- exit 1
26
- end
22
+ find_library('glib-2.0', 'main')
23
+ find_library('xml2', 'main')
24
+
25
+ create_makefile 'ots'
@@ -0,0 +1,185 @@
1
+ /*
2
+ * grader-tc.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+
27
+ #include "grader-tc.h"
28
+
29
+
30
+ /*Grader - Term count algorithm*/
31
+ /*This is a non-normalized term-frequency algorithm that does not use an inverse document frequency database */
32
+
33
+ #define NUM_KEY_WORDS 100 /* use first n key words only */
34
+
35
+ int
36
+ ots_get_article_word_count (const OtsArticle * Doc)
37
+ {
38
+ GList *li;
39
+ int articleWC;
40
+ articleWC = 0;
41
+
42
+ if (Doc==NULL) return 0;
43
+
44
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
45
+ {
46
+ articleWC += ((OtsSentence *) li->data)->wc;
47
+ }
48
+
49
+ return articleWC;
50
+ }
51
+
52
+
53
+ /*take this line and add each word to the "wordStat" list
54
+ * this list will hold all of the words in the article and the number
55
+ * of times they appeared in the article.
56
+ */
57
+
58
+ static void
59
+ ots_line_add_wordlist(OtsArticle * Doc,const OtsSentence * aLine)
60
+ {
61
+ GList *li;
62
+ if ((aLine==NULL) ||(NULL==Doc)) { return;}
63
+
64
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
65
+ if (li->data && strlen (li->data)) ots_add_wordstat (Doc, (char *)li->data);
66
+
67
+ return;
68
+ }
69
+
70
+ static void
71
+ ots_create_wordlist(OtsArticle * Doc)
72
+ {
73
+ GList *line;
74
+ if (Doc==NULL) return;
75
+
76
+ for (line = (GList *) Doc->lines; line != NULL; line = line->next)
77
+ {
78
+ OtsSentence * aLine=line->data;
79
+ if (aLine)
80
+ ots_line_add_wordlist(Doc,aLine);
81
+ }
82
+ }
83
+
84
+
85
+
86
+
87
/*
 * Weight applied to a keyword according to its 1-based rank n in the
 * sorted keyword list: rank 1 weighs 3, ranks 2-4 weigh 2, and every
 * other value weighs 1.
 */
static int
keyVal (const int n)
{
  switch (n)
    {
    case 1:
      return 3;
    case 2:
    case 3:
    case 4:
      return 2;
    default:
      return 1;
    }
}
96
+
97
+
98
+ static void
99
+ ots_grade_line (GList *impList, OtsSentence * aLine,
100
+ OtsStemRule * rule)
101
+ {
102
+ GList *li;
103
+ GList *di;
104
+ int n;
105
+ char *tmp_stem;
106
+
107
+ if ((aLine==NULL)||(rule==NULL)||(impList==NULL)) return;
108
+
109
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word */
110
+ {
111
+ n = 0;
112
+ tmp_stem = ots_stem_strip ((unsigned char *) li->data, rule);
113
+
114
+ for (di = (GList *) impList;
115
+ ((di != NULL) && (n < NUM_KEY_WORDS)); di = di->next)
116
+ {
117
+ n++;
118
+ if ((NULL!=((OtsWordEntery *) di->data)->stem) && (NULL!=tmp_stem))
119
+ if (0 == strcmp ((((OtsWordEntery *) di->data)->stem), tmp_stem))
120
+ {
121
+ /* debug:
122
+ if (0!=strcmp((((OtsWordEntery *) di->data)->word),li->data))
123
+ printf("[%s][%s] stem[%s]\n",(((OtsWordEntery *) di->data)->word),li->data,tmp);*/
124
+
125
+ aLine->score += (((OtsWordEntery *) di->data)->occ) * keyVal (n);
126
+ }
127
+
128
+ }
129
+
130
+ g_free (tmp_stem);
131
+ }
132
+
133
+ }
134
+
135
+
136
+ void
137
+ ots_create_title_tc(OtsArticle * Doc)
138
+ {
139
+
140
+ char *tmp;
141
+ char *word;
142
+ int i;
143
+ GString *title;
144
+ if (NULL==Doc) return;
145
+
146
+ title=g_string_new(NULL);
147
+
148
+ for (i=0;i<5;i++)
149
+ {
150
+ word = ots_word_in_list(Doc->ImpWords,i);
151
+ if (word) g_string_append(title,word); else break;
152
+ if (i<4) g_string_append(title,",");
153
+ }
154
+
155
+ tmp=title->str;
156
+ if (NULL!=title) g_string_free(title,FALSE);
157
+ Doc->title=tmp;
158
+ }
159
+
160
+
161
+ void
162
+ ots_grade_doc_tc (OtsArticle * Doc)
163
+ {
164
+
165
+ GList *li;
166
+ if (NULL==Doc) return;
167
+ ots_create_wordlist(Doc);
168
+
169
+
170
+ Doc->ImpWords=ots_union_list (Doc->wordStat, Doc->dict); /* subtract from the Article wordlist all the words in the dic file (on , the , is...) */
171
+ Doc->ImpWords=ots_sort_list (Doc->ImpWords); /* sort the list , top 3 is what the article talks about (SARS , virus , cure ... ) */
172
+
173
+ /*to print wordlist: ots_print_wordlist (stdout, Doc->ImpWords);*/
174
+
175
+ if (0 == Doc->lineCount) return;
176
+
177
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
178
+ {
179
+ if (li->data)
180
+ ots_grade_line (Doc->ImpWords, (OtsSentence *) li->data, Doc->stem);
181
+ }
182
+
183
+
184
+ ots_create_title_tc(Doc);
185
+ }
@@ -0,0 +1,64 @@
1
+ /*
2
+ * grader-tc.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_GRADERTC_H
22
+ #define HAVE_GRADERTC_H
23
+
24
+
25
+ #include <glib.h>
26
+ #include "libots.h"
27
+
28
+ G_BEGIN_DECLS
29
+
30
+
31
+ typedef struct /* one entry of a word list: a word, its stem and its occurrence count */
32
+ {
33
+ gchar *word; /* the word */
34
+ gchar *stem; /* stem of the word; the term-count grader compares this against the stems of sentence words */
35
+ gint occ; /* how many times have we seen this word in the text? (used as the score contribution of a match) */
36
+ } OtsWordEntery;
37
+
38
+ /*Word list manipulations*/
39
+ void ots_free_wordlist (GList *aList);
40
+
41
+
42
+
43
+ OtsWordEntery *ots_copy_wordEntery (OtsWordEntery * obj);
44
+ OtsWordEntery *ots_new_wordEntery (unsigned const char *wordString);
45
+ OtsWordEntery *ots_new_wordEntery_strip (unsigned const char *wordString,const OtsStemRule *rule);
46
+ void ots_free_wordEntery (OtsWordEntery * WC);
47
+
48
+ GList *ots_sort_list (GList* aList);
49
+ GList *ots_union_list (const GList *aLst, const GList * bLst);
50
+
51
+ char *ots_word_in_list (const GList *aList,const int index);
52
+ char *ots_stem_in_list (const GList *aList,const int index);
53
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
54
+
55
+
56
+ /*grader*/
57
+
58
+ void ots_grade_doc_tc (OtsArticle * Doc);
59
+
60
+ G_END_DECLS
61
+
62
+
63
+
64
+ #endif /* HAVE_GRADERTC_H */
@@ -0,0 +1,116 @@
1
+ /*
2
+ * grader-tf.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "libots.h"
25
+
26
+ /*Grader - using the Term frequency algorithm. Will give each line a score*/
27
+
28
+
29
+
30
+ OtsWordTF*
31
+ ots_new_OtsWordTF(const char* word,const double tf)
32
+ {
33
+ OtsWordTF* obj=g_new0(OtsWordTF,1);
34
+ if (word!=NULL) obj->word=g_strdup(word);
35
+ obj->tf=tf;
36
+ return obj;
37
+ }
38
+
39
+ void
40
+ ots_free_OtsWordTF(OtsWordTF *obj)
41
+ {
42
+ if (obj!=NULL)
43
+ {
44
+ if (obj->word!=NULL) g_free(obj->word);
45
+ g_free(obj);
46
+ }
47
+ }
48
+
49
+ void
50
+ ots_free_TF_wordlist (GList * aList)
51
+ {
52
+ if (aList != NULL)
53
+ {
54
+ g_list_foreach(aList,(GFunc)ots_free_OtsWordTF, NULL);
55
+ g_list_free(aList);
56
+ }
57
+ }
58
+
59
+
60
+ void
61
+ ots_grade_line_tf (OtsSentence * aLine)
62
+ {
63
+
64
+ return;
65
+ }
66
+
67
+
68
+
69
+ void
70
+ ots_grade_doc_tf (OtsArticle * Doc)
71
+ {
72
+
73
+ GList *li;
74
+
75
+ /*Load tf list*/
76
+ /*Load idf list*/
77
+
78
+ if (0 == Doc->lineCount) return;
79
+
80
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
81
+ {
82
+ ots_grade_line_tf ((OtsSentence *) li->data /* , tf list , idf list*/);
83
+ }
84
+
85
+ return;
86
+ }
87
+
88
+
89
/*
 * Combine a term frequency and an inverse document frequency into a
 * single word score, the product tf * idf.
 *
 *   tf  - how often the word occurs in the document
 *   idf - how rare the word is across the collection
 */
double
ots_tf_word_score (const double tf,const double idf)
{
  const double score = tf * idf;

  return score;
}
97
+
98
+ /*
99
+ Determine frequency of query words
100
+ n = (num-of-sentences words appears in)
101
+ N = (total-number-of-sentences)
102
+ f = n/N
103
+ */
104
+
105
+ double
106
+ ots_calc_idf (const int term_count,const int doc_word_count)
107
+ {
108
+ return -log(doc_word_count/term_count);
109
+ }
110
+
111
/*
 * Return 0.5 + 0.5 * (doc_word_count / term_count), or 0 when
 * term_count is 0.
 *
 * The ratio is now computed in floating point; the original divided
 * two ints, so any fractional part of the ratio was silently dropped.
 *
 * NOTE(review): the ratio is kept as doc_word_count / term_count,
 * exactly as the original wrote it - confirm that this orientation
 * is the intended one.
 */
double
ots_calc_tf (const int term_count,const int doc_word_count)
{
  if (term_count == 0)
    return 0;

  return 0.5 + 0.5 * ((double) doc_word_count / (double) term_count);
}
@@ -0,0 +1,85 @@
1
+ /*
2
+ * grader.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ extern void ots_grade_doc_tc (OtsArticle * Doc);
27
+
28
+ /*Grader driver - will call one of the grading algorithm*/
29
+
30
+
31
+
32
+ void
33
+ ots_grade_structure (OtsArticle * Doc) /*must be called after the first grader*/
34
+ {
35
+ GList *li;
36
+ GList *first;
37
+ GList *second;
38
+ OtsSentence *first_line=NULL;
39
+
40
+ first = NULL;
41
+ second = NULL;
42
+
43
+ if (Doc==NULL) return;
44
+
45
+ if (Doc->lines!=NULL)
46
+ first_line= ((OtsSentence *) (Doc->lines->data));
47
+ if (NULL!=first_line) first_line->score *= 2; /*first line/title is very important so we increase its score */
48
+
49
+ /*This loop will *1.6 the score of each line that
50
+ starts with \n \n , in other words a new paragraph*/
51
+
52
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
53
+ {
54
+ OtsSentence *aLine = (li->data);
55
+ if (NULL != aLine) /*line is there */
56
+ {
57
+ first = aLine->words; /*first word? */
58
+ if (NULL != first)
59
+ second = first->next; /*second word? */
60
+ if ((NULL != first) && (NULL != second)) /*have content? */
61
+ if (strcmp (first->data, "\n") && strcmp (second->data, "\n")) /*new paragraph? */
62
+ aLine->score *= 1.6;
63
+ }
64
+
65
+ }
66
+
67
+ }
68
+
69
+ /**
70
+ Each grader needs to do:
71
+ 1.give a ->score to each line
72
+ 2.Set the ->title of the document
73
+ **/
74
+
75
+ void
76
+ ots_grade_doc (OtsArticle * Doc)
77
+ {
78
+
79
+ if (Doc==NULL) return;
80
+ ots_grade_doc_tc(Doc); /*Term count*/
81
+
82
+ /* or ots_grade_doc_fc (Doc); Term Frequency */
83
+
84
+ ots_grade_structure (Doc);
85
+ }