ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -1,26 +1,25 @@
1
1
  require 'mkmf'
2
2
 
3
- $CFLAGS = %x{pkg-config --cflags glib-2.0}.strip
4
- $LGFLAGS = %x{pkg-config --libs glib-2.0}.strip
3
+ glib_cflags = %x{pkg-config --cflags glib-2.0}.strip
4
+ glib_ldflags = %x{pkg-config --libs glib-2.0}.strip
5
5
 
6
- if $CFLAGS.empty?
6
+ if glib_cflags.empty?
7
7
  warn %q{WARNING: No pkg-config found for glib-2.0, using defaults. Set GLIB_INCLUDE_DIR env to override.}
8
8
  dirs = ENV.fetch('GLIB_INCLUDE_DIR', '/usr/include/glib-2.0 /usr/lib/glib-2.0/include')
9
- $CFLAGS = dirs.split(/\s+/).map {|dir| "-I#{dir}"}.join(' ')
9
+ glib_cflags = dirs.split(/\s+/).map {|dir| "-I#{dir}"}.join(' ')
10
10
  end
11
11
 
12
- if $LDFLAGS.empty?
12
+ if glib_ldflags.empty?
13
13
  warn %q{WARNING: No pkg-config found for glib-2.0, using defaults. Set GLIB_LIB env to override.}
14
14
  libs = ENV.fetch('GLIB_LIB', 'glib-2.0')
15
- $LDFLAGS = libs.split(/\s+/).map {|lib| "-l#{lib}"}.join(' ')
15
+ glib_ldflags = libs.split(/\s+/).map {|lib| "-l#{lib}"}.join(' ')
16
16
  end
17
17
 
18
- dir_config("libots", ["/usr/local", "/opt/local", "/usr"])
18
+ dir = File.expand_path(File.dirname(__FILE__) + '/../dictionaries')
19
+ $CFLAGS = glib_cflags + %Q{ -I/usr/include/libxml2 -DDICTIONARY_DIR='"#{dir}/"'}
20
+ $LDFLAGS = glib_ldflags
19
21
 
20
- headers = [ 'stdio.h', 'stdlib.h', 'string.h', 'libots-1/ots/libots.h' ]
21
- if have_header('libots-1/ots/libots.h') && have_library('ots-1', 'ots_new_article', headers)
22
- create_makefile 'ots'
23
- else
24
- puts "Cannot find libots headers or libraries"
25
- exit 1
26
- end
22
+ find_library('glib-2.0', 'main')
23
+ find_library('xml2', 'main')
24
+
25
+ create_makefile 'ots'
@@ -0,0 +1,185 @@
1
+ /*
2
+ * grader-tc.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+
27
+ #include "grader-tc.h"
28
+
29
+
30
+ /*Grader - Term count algorithm*/
31
+ /*This is a non-normalized term frequency algorithm that does not use an inverse document frequency database */
32
+
33
+ #define NUM_KEY_WORDS 100 /* use first n key words only */
34
+
35
+ int
36
+ ots_get_article_word_count (const OtsArticle * Doc)
37
+ {
38
+ GList *li;
39
+ int articleWC;
40
+ articleWC = 0;
41
+
42
+ if (Doc==NULL) return 0;
43
+
44
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
45
+ {
46
+ articleWC += ((OtsSentence *) li->data)->wc;
47
+ }
48
+
49
+ return articleWC;
50
+ }
51
+
52
+
53
+ /*take this line and add each word to the "wordStat" list
54
+ * this list will hold all of the words in the article and the number
55
+ * of times they appeared in the article.
56
+ */
57
+
58
+ static void
59
+ ots_line_add_wordlist(OtsArticle * Doc,const OtsSentence * aLine)
60
+ {
61
+ GList *li;
62
+ if ((aLine==NULL) ||(NULL==Doc)) { return;}
63
+
64
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
65
+ if (li->data && strlen (li->data)) ots_add_wordstat (Doc, (char *)li->data);
66
+
67
+ return;
68
+ }
69
+
70
+ static void
71
+ ots_create_wordlist(OtsArticle * Doc)
72
+ {
73
+ GList *line;
74
+ if (Doc==NULL) return;
75
+
76
+ for (line = (GList *) Doc->lines; line != NULL; line = line->next)
77
+ {
78
+ OtsSentence * aLine=line->data;
79
+ if (aLine)
80
+ ots_line_add_wordlist(Doc,aLine);
81
+ }
82
+ }
83
+
84
+
85
+
86
+
87
/*
 * Weight applied to a key word according to its 1-based position n in
 * the important-word list: position 1 weighs 3, positions 2 to 4 weigh 2,
 * and every other value weighs 1.
 */
static int
keyVal (const int n)
{
  switch (n)
    {
    case 1:
      return 3;
    case 2:
    case 3:
    case 4:
      return 2;
    default:
      return 1;
    }
}
96
+
97
+
98
+ static void
99
+ ots_grade_line (GList *impList, OtsSentence * aLine,
100
+ OtsStemRule * rule)
101
+ {
102
+ GList *li;
103
+ GList *di;
104
+ int n;
105
+ char *tmp_stem;
106
+
107
+ if ((aLine==NULL)||(rule==NULL)||(impList==NULL)) return;
108
+
109
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word */
110
+ {
111
+ n = 0;
112
+ tmp_stem = ots_stem_strip ((unsigned char *) li->data, rule);
113
+
114
+ for (di = (GList *) impList;
115
+ ((di != NULL) && (n < NUM_KEY_WORDS)); di = di->next)
116
+ {
117
+ n++;
118
+ if ((NULL!=((OtsWordEntery *) di->data)->stem) && (NULL!=tmp_stem))
119
+ if (0 == strcmp ((((OtsWordEntery *) di->data)->stem), tmp_stem))
120
+ {
121
+ /* debug:
122
+ if (0!=strcmp((((OtsWordEntery *) di->data)->word),li->data))
123
+ printf("[%s][%s] stem[%s]\n",(((OtsWordEntery *) di->data)->word),li->data,tmp);*/
124
+
125
+ aLine->score += (((OtsWordEntery *) di->data)->occ) * keyVal (n);
126
+ }
127
+
128
+ }
129
+
130
+ g_free (tmp_stem);
131
+ }
132
+
133
+ }
134
+
135
+
136
+ void
137
+ ots_create_title_tc(OtsArticle * Doc)
138
+ {
139
+
140
+ char *tmp;
141
+ char *word;
142
+ int i;
143
+ GString *title;
144
+ if (NULL==Doc) return;
145
+
146
+ title=g_string_new(NULL);
147
+
148
+ for (i=0;i<5;i++)
149
+ {
150
+ word = ots_word_in_list(Doc->ImpWords,i);
151
+ if (word) g_string_append(title,word); else break;
152
+ if (i<4) g_string_append(title,",");
153
+ }
154
+
155
+ tmp=title->str;
156
+ if (NULL!=title) g_string_free(title,FALSE);
157
+ Doc->title=tmp;
158
+ }
159
+
160
+
161
+ void
162
+ ots_grade_doc_tc (OtsArticle * Doc)
163
+ {
164
+
165
+ GList *li;
166
+ if (NULL==Doc) return;
167
+ ots_create_wordlist(Doc);
168
+
169
+
170
+ Doc->ImpWords=ots_union_list (Doc->wordStat, Doc->dict); /* subtract from the Article wordlist all the words in the dic file (on , the , is...) */
171
+ Doc->ImpWords=ots_sort_list (Doc->ImpWords); /* sort the list , top 3 is what the article talks about (SARS , virus , cure ... ) */
172
+
173
+ /*to print wordlist: ots_print_wordlist (stdout, Doc->ImpWords);*/
174
+
175
+ if (0 == Doc->lineCount) return;
176
+
177
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
178
+ {
179
+ if (li->data)
180
+ ots_grade_line (Doc->ImpWords, (OtsSentence *) li->data, Doc->stem);
181
+ }
182
+
183
+
184
+ ots_create_title_tc(Doc);
185
+ }
@@ -0,0 +1,64 @@
1
+ /*
2
+ * grader-tc.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_GRADERTC_H
22
+ #define HAVE_GRADERTC_H
23
+
24
+
25
+ #include <glib.h>
26
+ #include "libots.h"
27
+
28
+ G_BEGIN_DECLS
29
+
30
+
31
+ typedef struct
32
+ {
33
+ gchar *word; /* the word */
34
+ gchar *stem; /* stem of the word; ots_grade_line() compares stemmed sentence words against it with strcmp() */
35
+ gint occ; /* how many times have we seen this word in the text? ots_grade_line() multiplies it by the rank weight when scoring */
36
+ } OtsWordEntery;
37
+
38
+ /*Word list manipulations*/
39
+ void ots_free_wordlist (GList *aList);
40
+
41
+
42
+
43
+ OtsWordEntery *ots_copy_wordEntery (OtsWordEntery * obj);
44
+ OtsWordEntery *ots_new_wordEntery (unsigned const char *wordString);
45
+ OtsWordEntery *ots_new_wordEntery_strip (unsigned const char *wordString,const OtsStemRule *rule);
46
+ void ots_free_wordEntery (OtsWordEntery * WC);
47
+
48
+ GList *ots_sort_list (GList* aList);
49
+ GList *ots_union_list (const GList *aLst, const GList * bLst);
50
+
51
+ char *ots_word_in_list (const GList *aList,const int index);
52
+ char *ots_stem_in_list (const GList *aList,const int index);
53
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
54
+
55
+
56
+ /*grader*/
57
+
58
+ void ots_grade_doc_tc (OtsArticle * Doc);
59
+
60
+ G_END_DECLS
61
+
62
+
63
+
64
+ #endif /* HAVE_GRADERTC_H */
@@ -0,0 +1,116 @@
1
+ /*
2
+ * grader-tf.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libots.h"
25
+
26
+ /*Grader - using the Term frequency algorithm. Will give each line a score*/
27
+
28
+
29
+
30
+ OtsWordTF*
31
+ ots_new_OtsWordTF(const char* word,const double tf)
32
+ {
33
+ OtsWordTF* obj=g_new0(OtsWordTF,1);
34
+ if (word!=NULL) obj->word=g_strdup(word);
35
+ obj->tf=tf;
36
+ return obj;
37
+ }
38
+
39
+ void
40
+ ots_free_OtsWordTF(OtsWordTF *obj)
41
+ {
42
+ if (obj!=NULL)
43
+ {
44
+ if (obj->word!=NULL) g_free(obj->word);
45
+ g_free(obj);
46
+ }
47
+ }
48
+
49
+ void
50
+ ots_free_TF_wordlist (GList * aList)
51
+ {
52
+ if (aList != NULL)
53
+ {
54
+ g_list_foreach(aList,(GFunc)ots_free_OtsWordTF, NULL);
55
+ g_list_free(aList);
56
+ }
57
+ }
58
+
59
+
60
+ void
61
+ ots_grade_line_tf (OtsSentence * aLine)
62
+ {
63
+
64
+ return;
65
+ }
66
+
67
+
68
+
69
+ void
70
+ ots_grade_doc_tf (OtsArticle * Doc)
71
+ {
72
+
73
+ GList *li;
74
+
75
+ /*Load tf list*/
76
+ /*Load idf list*/
77
+
78
+ if (0 == Doc->lineCount) return;
79
+
80
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
81
+ {
82
+ ots_grade_line_tf ((OtsSentence *) li->data /* , tf list , idf list*/);
83
+ }
84
+
85
+ return;
86
+ }
87
+
88
+
89
/*
 * Combine a word's two measures into one score, the product tf * idf.
 *
 *   tf  - how often the word occurs in the document
 *   idf - how rare the word is across the collection
 */
double
ots_tf_word_score (const double tf, const double idf)
{
  const double score = idf * tf;

  return score;
}
97
+
98
+ /*
99
+ Determine frequency of query words
100
+ n = (num-of-sentences words appears in)
101
+ N = (total-number-of-sentences)
102
+ f = n/N
103
+ */
104
+
105
+ double
106
+ ots_calc_idf (const int term_count,const int doc_word_count)
107
+ {
108
+ return -log(doc_word_count/term_count);
109
+ }
110
+
111
/*
 * Return 0.5 + 0.5 * (doc_word_count / term_count), or 0 when term_count
 * is 0.
 *
 * The ratio is now evaluated in floating point; the previous integer
 * division silently truncated it before scaling.
 *
 * NOTE(review): the ratio is kept in the orientation the original code
 * used (document count over term count) - confirm that this is the
 * intended formula.
 */
double
ots_calc_tf (const int term_count, const int doc_word_count)
{
  if (0 == term_count)
    return 0.0;

  return 0.5 + 0.5 * ((double) doc_word_count / (double) term_count);
}
@@ -0,0 +1,85 @@
1
+ /*
2
+ * grader.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ extern void ots_grade_doc_tc (OtsArticle * Doc);
27
+
28
+ /*Grader driver - will call one of the grading algorithm*/
29
+
30
+
31
+
32
+ void
33
+ ots_grade_structure (OtsArticle * Doc) /*must be called after the first grader*/
34
+ {
35
+ GList *li;
36
+ GList *first;
37
+ GList *second;
38
+ OtsSentence *first_line=NULL;
39
+
40
+ first = NULL;
41
+ second = NULL;
42
+
43
+ if (Doc==NULL) return;
44
+
45
+ if (Doc->lines!=NULL)
46
+ first_line= ((OtsSentence *) (Doc->lines->data));
47
+ if (NULL!=first_line) first_line->score *= 2; /*first line/title is very important so we increase its score */
48
+
49
+ /*This loop will *1.6 the score of each line that
50
+ starts with \n \n , in other words a new paragraph*/
51
+
52
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
53
+ {
54
+ OtsSentence *aLine = (li->data);
55
+ if (NULL != aLine) /*line is there */
56
+ {
57
+ first = aLine->words; /*first word? */
58
+ if (NULL != first)
59
+ second = first->next; /*second word? */
60
+ if ((NULL != first) && (NULL != second)) /*have content? */
61
+ if (strcmp (first->data, "\n") && strcmp (second->data, "\n")) /*new paragraph? */
62
+ aLine->score *= 1.6;
63
+ }
64
+
65
+ }
66
+
67
+ }
68
+
69
+ /**
70
+ Each grader needs to do:
71
+ 1.give a ->score to each line
72
+ 2.Set the ->title of the document
73
+ **/
74
+
75
+ void
76
+ ots_grade_doc (OtsArticle * Doc)
77
+ {
78
+
79
+ if (Doc==NULL) return;
80
+ ots_grade_doc_tc(Doc); /*Term count*/
81
+
82
+ /* or ots_grade_doc_fc (Doc); Term Frequency */
83
+
84
+ ots_grade_structure (Doc);
85
+ }