wapiti 0.1.1 → 1.0.0

Files changed (52)
  1. checksums.yaml +5 -5
  2. data/HISTORY.md +8 -0
  3. data/LICENSE +1 -1
  4. data/README.md +39 -95
  5. data/ext/wapiti/bcd.c +1 -1
  6. data/ext/wapiti/extconf.rb +15 -1
  7. data/ext/wapiti/lbfgs.c +6 -6
  8. data/ext/wapiti/model.c +2 -3
  9. data/ext/wapiti/model.h +0 -7
  10. data/ext/wapiti/native.c +89 -239
  11. data/ext/wapiti/native.h +0 -5
  12. data/ext/wapiti/pattern.c +1 -1
  13. data/ext/wapiti/progress.c +19 -44
  14. data/ext/wapiti/progress.h +1 -4
  15. data/ext/wapiti/rprop.c +3 -4
  16. data/ext/wapiti/sgdl1.c +3 -3
  17. data/ext/wapiti/tools.c +36 -30
  18. data/ext/wapiti/tools.h +9 -4
  19. data/ext/wapiti/trainers.c +55 -0
  20. data/ext/wapiti/trainers.h +4 -1
  21. data/lib/wapiti.rb +4 -24
  22. data/lib/wapiti/dataset.rb +162 -0
  23. data/lib/wapiti/errors.rb +0 -4
  24. data/lib/wapiti/log.rb +29 -0
  25. data/lib/wapiti/model.rb +63 -40
  26. data/lib/wapiti/options.rb +66 -29
  27. data/lib/wapiti/sequence.rb +105 -0
  28. data/lib/wapiti/token.rb +74 -0
  29. data/lib/wapiti/version.rb +1 -1
  30. metadata +20 -80
  31. data/.autotest +0 -13
  32. data/.rspec +0 -3
  33. data/.simplecov +0 -3
  34. data/Gemfile +0 -29
  35. data/Rakefile +0 -63
  36. data/ext/wapiti/wapiti.c +0 -410
  37. data/spec/fixtures/ch.mod +0 -18550
  38. data/spec/fixtures/chpattern.txt +0 -52
  39. data/spec/fixtures/chtest.txt +0 -1973
  40. data/spec/fixtures/chtrain.txt +0 -19995
  41. data/spec/fixtures/nppattern.txt +0 -52
  42. data/spec/fixtures/nptest.txt +0 -1973
  43. data/spec/fixtures/nptrain.txt +0 -19995
  44. data/spec/fixtures/pattern.txt +0 -14
  45. data/spec/fixtures/test.txt +0 -60000
  46. data/spec/fixtures/train.txt +0 -1200
  47. data/spec/spec_helper.rb +0 -41
  48. data/spec/wapiti/model_spec.rb +0 -233
  49. data/spec/wapiti/native_spec.rb +0 -11
  50. data/spec/wapiti/options_spec.rb +0 -185
  51. data/spec/wapiti/utility_spec.rb +0 -22
  52. data/wapiti.gemspec +0 -49
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
-   metadata.gz: f8e3f42f711858caf64ca6682ea3aea8d9439a02
-   data.tar.gz: 32569d35d3e6d2af1adf91df7a1a8d7ba996d198
+ SHA256:
+   metadata.gz: f0357489368d9bbe57ea34e2bee7be8f6ae7542875971acdf48f24b038aebb32
+   data.tar.gz: 7267f065c30e82f581942cae7789bc4d71999f7a418c1549b5cd41a8ed4a1b80
  SHA512:
-   metadata.gz: 2ed36dbff7978ca5a731bc3381faad7b9fc362dcdd46ce0b7495dceacd6d21458a33bab5250ee71788d4b2e553625d3b39e61b4140ef701498f04742cf10d19c
-   data.tar.gz: d9e10608342bc351c88cbb61d37af504b983c3e5b973ee2a29919951987ec4d3a3919d98f053990300207820cf59ab297c80a6efb4806060999c0090079b9e39
+   metadata.gz: 194656b3d90ed6fedf32a2e1a8dc34cec4eccaeec71098c2499054697c66a3209d9436cfcf66e1b4b43e19e2f2b50f4ad5e2682df9d21f126e812940591de226
+   data.tar.gz: 9d365d09193a7c1657b1583331f20d1f985ad0f8c351947c791dd407a163463f4eb21ee6ac74545bd0f61ca6f88ab0ed669aaeefdaefdd03f2543cc22078a4f5
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
+ 1.0.0 / 2017-12-xx
+ ==================
+ * Added support for Windows platform
+ * Open files only if names are untainted
+ * Finalized API
+ * Fixed error reporting
+ * Removed progress logging
+
  0.1.1 / 2014-02-27
  ==================
  * Updated train routine
data/LICENSE CHANGED
@@ -1,5 +1,5 @@
  Wapiti-Ruby
- Copyright 2011-2014 Sylvester Keil. All rights reserved.
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.

  Wapiti - A linear-chain CRF tool
  Copyright 2009-2013 CNRS. All rights reserved.
data/README.md CHANGED
@@ -5,18 +5,14 @@ The Wapiti-Ruby gem provides a wicked fast linear-chain CRF
  API for sequence segmentation and labelling; it is based on the
  codebase of [wapiti](http://wapiti.limsi.fr/).

- [![Build Status](https://secure.travis-ci.org/inukshuk/wapiti-ruby.png)](http://travis-ci.org/inukshuk/wapiti-ruby)
- [![Coverage Status](https://coveralls.io/repos/inukshuk/wapiti-ruby/badge.png?branch=master)](https://coveralls.io/r/inukshuk/wapiti-ruby?branch=master)
+ [![Linux Build Status](https://travis-ci.org/inukshuk/wapiti-ruby.svg?branch=master)](https://travis-ci.org/inukshuk/wapiti-ruby)
+ [![Windows Build Status](https://ci.appveyor.com/api/projects/status/12rtxe2o8p55g1w6/branch/master?svg=true)](https://ci.appveyor.com/project/inukshuk/wapiti-ruby/branch/master)
+ [![Coverage Status](https://coveralls.io/repos/github/inukshuk/wapiti-ruby/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/wapiti-ruby?branch=master)

  Requirements
  ------------
- Wapiti is written in C and Ruby and requires a compiler with C99
- support (e.g., gcc); on GNU/Linux systems it will be fairly easy to install
- all necessary packages through your distribution.
-
- The Wapiti Ruby gem has been confirmed to work with MRI 2.x, 1.9.x, 1.8.7,
- and Rubinius.
-
+ Wapiti is written in C and Ruby and requires a compiler with C99 support;
+ it has been confirmed to work on Linux, macOS, and Windows.

  Quickstart
  ----------
@@ -29,49 +25,35 @@ Quickstart

  Using a pattern and training data stored in a file:

-     model = Wapiti.train('train.txt', :pattern => 'pattern.txt')
-     => #<Wapiti::Model:0x0000010188f868>
+     model = Wapiti.train('train.txt', pattern: 'pattern.txt')
+     #=> #<Wapiti::Model:0x0000010188f868>
      model.labels
-     => ["B-ADJP", "B-ADVP", "B-CONJP" ...]
+     #=> ["B-ADJP", "B-ADVP", "B-CONJP" ...]
      model.save('ch.mod')
-     => # saves the model as 'ch.mod'
+     #=> saves the model as 'ch.mod'

- Alternatively, you can pass in the training data as an array; the array
- should contain one array for each sequence of training data.
+ Alternatively, you can pass in the training data as a `Wapiti::Dataset`;
+ this class supports the default text format used by Wapiti as well as
+ additional formats (such as YAML or XML) and an API to make it easier
+ to manage data sets used for input and training.

-     data = []
-     data << ['Confidence NN B-NP', 'in IN B-PP', 'the DT B-NP', 'pound NN I-NP', '. . O']
-     ...
+     data = Wapiti::Dataset.open('chtrain.xml')
      model = Wapiti.train(data, options)

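A minimal end-to-end sketch of this Dataset-based training flow, pieced together from the calls shown above (the file names and option values are illustrative, not part of this diff):

    require 'wapiti'

    data  = Wapiti::Dataset.open('chtrain.xml')
    model = Wapiti.train(data, pattern: 'pattern.txt', threads: 4)
    model.compact
    model.save('ch.mod')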
- You can consult the `Wapiti::Options` class for a list of supported
- configuration options and algorithms:
-
-     Wapiti::Options.attribute_names
-     => [:algorithm, :check, :compact, :convergence_window, :development_data,
-         :jobsize, :label, :max_iterations, :maxent, :pattern, :posterior, :rho1,
-         :rho2, :score, :sparse, :stop_epsilon, :stop_window, :threads]
-     Wapiti::Options.algorithms
-     => ["l-bfgs", "sgd-l1", "bcd", "rprop", "rprop+", "rprop-", "auto"]
+ You can consult `Wapiti::Options.attribute_names` for a list of
+ supported configuration options and `Wapiti::Options.algorithms` for
+ all supported algorithms:

  Use `#valid?` or `#validate` (which returns error messages) to make sure
  your configuration is supported by Wapiti.

- You can pass options either as an options hash or by adding a block to the
- method invocation:
-
-     model = Wapiti::Model.train(data) do |config|
-       config.pattern = 'pattern.txt'
-       threads = 4
-     end
-
  Before saving your model you can use `compact` to reduce the model's size:

      model.save 'm1.mod'
-     => # m1.mod file size 1.8M
+     #=> m1.mod file size 1.8M
      model.compact
      model.save 'm2.mod'
-     => # m2.mod file size 471K
+     #=> m2.mod file size 471K


  ### Loading existing Models
@@ -80,50 +62,33 @@ Before saving your model you can use `compact` to reduce the model's size:

  ### Labelling

- By calling `#label` on a Model instance you can add labels to your sequence
- data:
+ By calling `#label` on a Model instance you can add labels to a dataset:

      model = Wapiti.load('m2.mod')
-     model.label('test.txt')
-     => [[["Confidence NN B-NP", "B-NP"], ["in IN B-PP", "B-PP"] ... ]
+     input = Wapiti::Dataset.load('chtest.txt')
+     output = model.label(input, tagged: true)

- The result is an array of sequence arrays; each sequence array consists of
- the original token and feature string (when using test data, the final
- feature is usually the expected label) and the label calculated by Wapiti.
+ The result is a new `Wapiti::Dataset` with the predicted labels for each
+ token. If your input data was already tagged, you can compare the input
+ and output datasets to evaluate your results:

- As with training data, you can pass in data either by filename or as
- a Ruby Array:
-
-     model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']]
-     => [[["Confidence NN", "B-NP"], ["in IN", "B-PP"], ["the DT", "B-NP"],
-         ["pound NN", "I-NP"], [". .", "O"]]]
+     output - input
+     #=> new dataset of output sequences which are tagged differently than expected

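Putting the labelling and evaluation calls above together, a minimal sketch using only the methods shown in this README (model and data file names are illustrative):

    model  = Wapiti.load('ch.mod')
    input  = Wapiti::Dataset.load('chtest.txt')
    output = model.label(input, tagged: true)
    errors = output - input   # sequences with at least one tag that differs from the input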
  If you pass a block to `#label` Wapiti will yield each token and the
  corresponding label:

-     model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']] do |token, label|
+     model.label input do |token, label|
        [token.downcase, label.downcase]
      end
-     => [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
-         ["pound nn", "i-np"], [". .", "o"]]]

  Note that if you set the *:score* option (either in the Model's `#options` or
  when calling `#label`), the score for each label will be appended to
  each token/label tuple as a floating point number or passed as a third
  argument to the passed-in block.

-     model.label [['Confidence NN']], :score => true
-     => [[["Confidence NN", "B-NP", 4.642034838737357]]]
-
- Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
- will append more label and, optionally, score values to each tuple.
-
-     model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
-     => [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
-
- Note how we also suppressed the output of the token string using the
- *:skip_tokens* option.
-
+     model.label input, score: true
+     #=> Dataset where each token will include a score


  ### Statistics
@@ -131,41 +96,20 @@ By setting the *:check* option you can tell Wapiti to keep statistics during
  the labelling phase (for the statistics to be meaningful you obviously need
  to provide input data that is already labelled). Wapiti does not reset the
  counters during consecutive calls to `#label` to allow you to collect
- accumulative date; however, you can reset the counters at any time, by calling
- `#clear_counters`.
+ cumulative stats; however, you can reset the counters at any time by calling
+ `#reset_counters`.

  After calling `#label` with the *:check* options set and appropriately labelled
  input, you can access the statistics via `#statistics` (the individual values
  are also available through the associated attribute readers).

-     model.label 'test.txt', :check => true
-     => {:tokens=>{:total=>1896, :errors=>137, :rate=>7.225738396624472},
-         :sequences=>{:total=>77, :errors=>50, :rate=>64.93506493506494}}
-
-
- Citing
- ------
- If you're using Wapiti-Ruby for research purposes, please use the following
- citation of the original wapiti package:
-
-     @article{lavergne2010practical,
-       author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, Fran\c{c}ois},
-       title = {Practical Very Large Scale {CRFs}},
-       booktitle = {Proceedings the 48th Annual Meeting of the Association for
-                    Computational Linguistics (ACL)},
-       month = {July},
-       year = {2010},
-       location = {Uppsala, Sweden},
-       publisher = {Association for Computational Linguistics},
-       pages = {504--513},
-       url = {http://www.aclweb.org/anthology/P10-1052}
-     }
-
- If you're profiting from any of the Wapiti-Ruby specific features you are
- welcome to also refer back to the
- [Wapiti-Ruby homepage](http://github.com/inukshuk/wapiti-ruby/).
+     model.label input, check: true
+     model.stats
+     => {:token=>{:count=>1896, :errors=>137, :rate=>7.225738396624472},
+         :sequence=>{:count=>77, :errors=>50, :rate=>64.93506493506494}}

+ For convenience, you can also use the `#check` method, which
+ will reset the counters, check your input, and return the stats.

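A short sketch of the checking workflow described above; the hash access mirrors the structure of the stats output shown (variable names are illustrative):

    model.label input, check: true
    stats = model.stats
    stats[:token][:rate]       #=> per-token error rate in percent
    stats[:sequence][:errors]  #=> number of sequences with at least one error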
  Contributing
  ------------
@@ -183,7 +127,7 @@ example, fix the bug and submit a pull request.
183
127
 
184
128
  License
185
129
  -------
186
- Copyright 2011-2014 Sylvester Keil. All rights reserved.
130
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.
187
131
 
188
132
  Copyright 2009-2013 CNRS. All rights reserved.
189
133
 
data/ext/wapiti/bcd.c CHANGED
@@ -377,7 +377,7 @@ void trn_bcd(mdl_t *mdl) {
          // And update the model
          bcd_update(mdl, bcd, o);
      }
-     if (!uit_progress(mdl, i, -1.0))
+     if (!uit_progress(mdl))
          break;
  }
  // Cleanup memory
data/ext/wapiti/extconf.rb CHANGED
@@ -1,6 +1,20 @@
  require 'mkmf'
+ require 'rbconfig'

- $CFLAGS << %q{ -std=c99 -W -Wall -Wno-declaration-after-statement -O3 }
+ cflags = %w{
+   -std=c99
+   -W
+   -Wall
+   -Wno-declaration-after-statement
+   -O3
+ }
+
+ case RbConfig::CONFIG['host_os']
+ when /^linux/i
+   cflags[0] = '-std=gnu99'
+ end
+
+ $CFLAGS << ' ' << cflags.join(' ')

  have_library('pthread')
  have_library('m')
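The compiler flags are now assembled per host platform via RbConfig; a small sketch of the mechanism (only the Linux branch exists in this change, the darwin branch below is hypothetical):

    require 'rbconfig'

    cflags = %w{-std=c99 -W -Wall -O3}
    case RbConfig::CONFIG['host_os']
    when /^linux/i then cflags[0] = '-std=gnu99'   # as in the change above
    when /darwin/i then cflags << '-Wno-unused'    # hypothetical extra branch
    end
    cflags.join(' ')  #=> "-std=gnu99 -W -Wall -O3" on Linux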
data/ext/wapiti/lbfgs.c CHANGED
@@ -104,12 +104,12 @@ void trn_lbfgs(mdl_t *mdl) {
              uint64_t f;
              if (fscanf(file, "%"PRIu64, &f) != 1)
                  fatal("1 %s", err);
-             if (fscanf(file, "%la %la", &xp[f], &gp[f]) != 2)
+             if (fscanf(file, "%le %le", &xp[f], &gp[f]) != 2)
                  fatal("2 %s", err);
              for (uint32_t m = 0; m < M; m++) {
-                 if (fscanf(file, "%la", &s[m][f]) != 1)
+                 if (fscanf(file, "%le", &s[m][f]) != 1)
                      fatal("3 %s", err);
-                 if (fscanf(file, "%la", &y[m][f]) != 1)
+                 if (fscanf(file, "%le", &y[m][f]) != 1)
                      fatal("4 %s", err);
              }
          }
@@ -271,7 +271,7 @@ void trn_lbfgs(mdl_t *mdl) {
              memcpy(x, xp, sizeof(double) * F);
              break;
          }
-         if (uit_progress(mdl, k + 1, fx) == false)
+         if (uit_progress(mdl) == false)
              break;
          // 3rd step: we update the history used for approximating the
          // inverse of the diagonal of the hessian
@@ -314,9 +314,9 @@ void trn_lbfgs(mdl_t *mdl) {
          fprintf(file, "#state#0#%"PRIu32"#%"PRIu64"\n", M, F);
          for (uint64_t f = 0; f < F; f++) {
              fprintf(file, "%"PRIu64, f);
-             fprintf(file, " %la %la", xp[f], gp[f]);
+             fprintf(file, " %le %le", xp[f], gp[f]);
              for (uint32_t m = 0; m < M; m++)
-                 fprintf(file, " %la %la", s[m][f], y[m][f]);
+                 fprintf(file, " %le %le", s[m][f], y[m][f]);
              fprintf(file, "\n");
          }
          fclose(file);
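The `%la` conversions here and in model.c are replaced with `%le`, so L-BFGS state and model weights are now written as decimal scientific notation rather than hexadecimal floats; the diff does not state the motivation, but `%a` support is not universal across C runtimes, which matters for the new Windows build. The difference between the two notations, illustrated in Ruby:

    format('%a', 0.1)  #=> "0x1.999999999999ap-4"  (hexadecimal float, exact)
    format('%e', 0.1)  #=> "1.000000e-01"          (decimal, rounded)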
data/ext/wapiti/model.c CHANGED
@@ -74,7 +74,6 @@ mdl_t *mdl_new(rdr_t *rdr) {
      mdl->train = mdl->devel = NULL;
      mdl->reader = rdr;
      mdl->werr = NULL;
-     mdl->total = 0.0;
      return mdl;
  }

@@ -272,7 +271,7 @@ void mdl_save(mdl_t *mdl, FILE *file) {
      rdr_save(mdl->reader, file);
      for (uint64_t f = 0; f < mdl->nftr; f++)
          if (mdl->theta[f] != 0.0)
-             fprintf(file, "%"PRIu64"=%la\n", f, mdl->theta[f]);
+             fprintf(file, "%"PRIu64"=%le\n", f, mdl->theta[f]);
  }

  /* mdl_load:
@@ -298,7 +297,7 @@ void mdl_load(mdl_t *mdl, FILE *file) {
      for (uint64_t i = 0; i < nact; i++) {
          uint64_t f;
          double v;
-         if (fscanf(file, "%"SCNu64"=%la\n", &f, &v) != 2)
+         if (fscanf(file, "%"SCNu64"=%le\n", &f, &v) != 2)
              fatal(err);
          mdl->theta[f] = v;
      }
data/ext/wapiti/model.h CHANGED
@@ -30,15 +30,12 @@

  #include <stddef.h>
  #include <stdint.h>
- #include <sys/time.h>

  #include "options.h"
  #include "sequence.h"
  #include "reader.h"
  #include "wapiti.h"

- typedef struct timeval tms_t;
-
  /* mdl_t:
   *   Represent a linear-chain CRF model. The model contain both unigram and
   *   bigram features. It is caracterized by <nlbl> the number of labels, <nobs>
@@ -86,10 +83,6 @@ struct mdl_s {
      double   *werr;   // Window of error rate of last iters
      uint32_t  wcnt;   // Number of iters in the window
      uint32_t  wpos;   // Position for the next iter
-
-     // Timing
-     tms_t     timer;  // start time of last iter
-     double    total;  // total training time
  };

  mdl_t *mdl_new(rdr_t *rdr);
data/ext/wapiti/native.c CHANGED
@@ -10,43 +10,16 @@
  #include "quark.h"
  #include "tools.h"
  #include "wapiti.h"
-
  #include "native.h"

  VALUE mWapiti;
  VALUE mNative;
-
  VALUE cOptions;
  VALUE cModel;
-
+ VALUE cArgumentError;
  VALUE cNativeError;
- VALUE cConfigurationError;
  VALUE cLogger;

-
- /* --- Forward declarations --- */
-
- int wapiti_main(int argc, char *argv[argc]);
-
- void dolabel(mdl_t *mdl);
-
-
- /* --- Utilities --- */
-
- static const struct {
-     const char *name;
-     void (* train)(mdl_t *mdl);
- } trn_lst[] = {
-     {"l-bfgs", trn_lbfgs},
-     {"sgd-l1", trn_sgdl1},
-     {"bcd",    trn_bcd },
-     {"rprop",  trn_rprop},
-     {"rprop+", trn_rprop},
-     {"rprop-", trn_rprop}
- };
- static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
-
-
  /* --- Options Class --- */

  // Auxiliary Methods
@@ -68,6 +41,14 @@ static void copy_string(char **dst, VALUE rb_string) {
      memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
  }

+ // Moves a string to the heap. We use this to move default
+ // values to the heap during initialization.
+ static char *to_heap(const char *string) {
+     char* ptr = calloc(strlen(string), sizeof(char));
+     memcpy(ptr, string, strlen(string));
+     return ptr;
+ }
+

  // Constructor / Desctructor

@@ -76,11 +57,11 @@ static void mark_options(opt_t* options __attribute__((__unused__))) {
  }

  static void deallocate_options(opt_t* options) {
-
      // free string options
      if (options->input) { free(options->input); }
      if (options->output) { free(options->output); }
      if (options->algo) { free((void*)options->algo); }
+     if (options->type) { free((void*)options->type); }
      if (options->devel) { free(options->devel); }
      if (options->pattern) { free((void*)options->pattern); }

@@ -101,21 +82,20 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
          options->maxiter = INT_MAX;
      }

-     // copy the default algorithm name to the heap so that all options strings
-     // are on the heap
-     char* tmp = calloc(strlen(options->algo), sizeof(char));
-     memcpy(tmp, options->algo, strlen(options->algo));
-     options->algo = tmp;
+     // Copy default algorithm and type name to the heap
+     // so that all options strings are on the heap.
+     options->algo = to_heap(options->algo);
+     options->type = to_heap(options->type);

      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

      // set defaults
      if (argc) {
          Check_Type(argv[0], T_HASH);
-         (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
+         (void)rb_funcall(self, rb_intern("update!"), 1, argv[0]);
      }

      // yield self if block_given?
@@ -431,7 +411,6 @@ static VALUE options_model(VALUE self) {
  static VALUE options_set_model(VALUE self, VALUE rb_string) {
      opt_t *options = get_options(self);
      copy_string(&(options->model), rb_string);
-
      return rb_string;
  }

@@ -443,19 +422,17 @@ static VALUE options_algorithm(VALUE self) {
  static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
      opt_t *options = get_options(self);
      copy_string((char**)&(options->algo), rb_string);
-
      return rb_string;
  }

- static VALUE options_development_data(VALUE self) {
-     char *development_data = get_options(self)->devel;
-     return rb_str_new2(development_data ? development_data : "");
+ static VALUE options_type(VALUE self) {
+     const char *type = get_options(self)->type;
+     return rb_str_new2(type ? type : "");
  }

- static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
+ static VALUE options_set_type(VALUE self, VALUE rb_string) {
      opt_t *options = get_options(self);
-     copy_string(&(options->devel), rb_string);
-
+     copy_string((char**)&(options->type), rb_string);
      return rb_string;
  }

@@ -565,11 +542,8 @@ void Init_options() {
      rb_define_alias(cOptions, "algo", "algorithm");
      rb_define_alias(cOptions, "algo=", "algorithm=");

-     rb_define_method(cOptions, "development_data", options_development_data, 0);
-     rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
-
-     rb_define_alias(cOptions, "devel", "development_data");
-     rb_define_alias(cOptions, "devel=", "development_data=");
+     rb_define_method(cOptions, "type", options_type, 0);
+     rb_define_method(cOptions, "type=", options_set_type, 1);

      rb_define_method(cOptions, "clip", options_clip, 0);
      rb_define_method(cOptions, "clip=", options_set_clip, 1);
@@ -640,7 +614,7 @@ static VALUE allocate_model(VALUE self) {

  static VALUE model_set_options(VALUE self, VALUE rb_options) {
      if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
-         rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
+         rb_raise(cArgumentError, "argument must be a Wapiti::Options instance");
      }

      mdl_t *model = get_model(self);
@@ -661,22 +635,20 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
      VALUE options;

      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

      if (argc) {
          if (TYPE(argv[0]) == T_HASH) {
              options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
-         }
-         else {
+         } else {
              if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
-                 rb_raise(cNativeError, "argument must be a hash or an options instance");
+                 rb_raise(cArgumentError, "argument must be a hash or an options instance");
              }
              options = argv[0];
          }
-     }
-     else {
+     } else {
          options = rb_funcall(cOptions, rb_intern("new"), 0);
      }

@@ -693,7 +665,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
      }

      // initialize counters
-     rb_funcall(self, rb_intern("clear_counters"), 0);
+     rb_funcall(self, rb_intern("reset_counters"), 0);

      return self;
  }
@@ -713,10 +685,6 @@ static VALUE model_nftr(VALUE self) {
      return INT2FIX(get_model(self)->nftr);
  }

- static VALUE model_total(VALUE self) {
-     return rb_float_new(get_model(self)->total);
- }
-

  // Instance methods

@@ -738,7 +706,7 @@ static VALUE model_compact(VALUE self) {
  // otherwise uses the passed-in argument as the Model's path.
  static VALUE model_save(int argc, VALUE *argv, VALUE self) {
      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

@@ -751,17 +719,13 @@
      }

      // open the output file
-     FILE *file = 0;
      VALUE path = rb_ivar_get(self, rb_intern("@path"));

      if (NIL_P(path)) {
-         rb_raise(cNativeError, "failed to save model: no path given");
-     }
-
-     if (!(file = fopen(StringValueCStr(path), "w"))) {
-         rb_raise(cNativeError, "failed to save model: failed to open model file");
+         fatal("failed to save model: no path given");
      }

+     FILE *file = ufopen(path, "w");
      mdl_save(model, file);
      fclose(file);

@@ -770,7 +734,7 @@ static VALUE model_save(int argc, VALUE *argv, VALUE self) {

  static VALUE model_load(int argc, VALUE *argv, VALUE self) {
      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

@@ -783,17 +747,13 @@ static VALUE model_load(int argc, VALUE *argv, VALUE self) {
      }

      // open the model file
-     FILE *file = 0;
      VALUE path = rb_ivar_get(self, rb_intern("@path"));

      if (NIL_P(path)) {
-         rb_raise(cNativeError, "failed to load model: no path given");
-     }
-
-     if (!(file = fopen(StringValueCStr(path), "r"))) {
-         rb_raise(cNativeError, "failed to load model: failed to open model file");
+         fatal("failed to load model: no path given");
      }

+     FILE *file = ufopen(path, "r");
      mdl_load(model, file);
      fclose(file);

@@ -849,31 +809,44 @@ static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
      return dat;
  }

+ static dat_t *ld_dat(rdr_t *reader, VALUE data, bool labelled) {
+     FILE *file;
+     dat_t *dat = (dat_t*)0;

- static VALUE model_train(VALUE self, VALUE data) {
+     switch (TYPE(data)) {
+     case T_STRING:
+         file = ufopen(data, "r");
+         dat = rdr_readdat(reader, file, labelled);
+         fclose(file);
+         break;

-     mdl_t* model = get_model(self);
+     case T_ARRAY:
+         dat = to_dat(reader, data, labelled);
+         break;

-     uint32_t trn;
-     for (trn = 0; trn < trn_cnt; trn++) {
-         if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
+     default:
+         fatal("invalid data type (expected instance of String or Array)");
      }

-     if (trn == trn_cnt) {
-         rb_raise(cNativeError,
-             "failed to train model: unknown algorithm '%s'", model->opt->algo);
-     }
+     return dat;
+ }
+

+ static VALUE model_train(VALUE self, VALUE train, VALUE devel) {
      FILE *file;
+     mdl_t *model = get_model(self);
+     trn_t trn = trn_get(model->opt->algo);
+     model->type = typ_get(model->opt->type);

      // Load the pattern file. This will unlock the database if previously
      // locked by loading a model.
      if (model->opt->pattern) {
+         info("load patterns");
          file = fopen(model->opt->pattern, "r");

          if (!file) {
-             rb_raise(cNativeError,
-                 "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
+             pfatal("failed to train model: failed to load pattern file '%s'",
+                 model->opt->pattern);
          }

          rdr_loadpat(model->reader, file);
@@ -886,58 +859,45 @@ static VALUE model_train(VALUE self, VALUE data) {
      // Load the training data. When this is done we lock the quarks as we
      // don't want to put in the model, informations present only in the
      // development set.
-
-     switch (TYPE(data)) {
-     case T_STRING:
-         if (!(file = fopen(StringValuePtr(data), "r"))) {
-             rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
-         }
-
-         model->train = rdr_readdat(model->reader, file, true);
-         fclose(file);
-
-         break;
-     case T_ARRAY:
-         model->train = to_dat(model->reader, data, true);
-
-         break;
-     default:
-         rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
-     }
+     model->train = ld_dat(model->reader, train, true);

      qrk_lock(model->reader->lbl, true);
      qrk_lock(model->reader->obs, true);

      if (!model->train || model->train->nseq == 0) {
-         rb_raise(cNativeError, "failed to train model: no training data loaded");
+         fatal("failed to train model: no training data loaded");
      }

      // If present, load the development set in the model. If not specified,
      // the training dataset will be used instead.
-     if (model->opt->devel) {
-         if (!(file = fopen(model->opt->devel, "r"))) {
-             rb_raise(cNativeError,
-                 "failed to train model: cannot open development file '%s'", model->opt->devel);
-         }
-
-         model->devel = rdr_readdat(model->reader, file, true);
-         fclose(file);
+     if (TYPE(devel) != T_NIL) {
+         model->devel = ld_dat(model->reader, devel, true);
      }

-     // Initialize the model. If a previous model was loaded, this will be
-     // just a resync, else the model structure will be created.
-     // rb_funcall(self, rb_intern("sync"), 0);
+     // Initialize the model. If a previous model was loaded, this will be
+     // just a resync, else the model structure will be created.
+     info((model->theta == NULL) ? "initialize model" : "re-sync model");
      mdl_sync(model);

-     // Train the model.
+     info("nb train: %"PRIu32"", model->train->nseq);
+     if (model->devel != NULL)
+         info("nb devel: %"PRIu32"", model->devel->nseq);
+     info("nb labels: %"PRIu32"", model->nlbl);
+     info("nb blocks: %"PRIu64"", model->nobs);
+     info("nb features: %"PRIu64"", model->nftr);
+
+     info("training model with %s", model->opt->algo);
      uit_setup(model);
-     trn_lst[trn].train(model);
+     trn(model);
      uit_cleanup(model);

-     // If requested compact the model.
      if (model->opt->compact) {
-         // rb_funcall(self, rb_intern("compact"), 0);
+         const uint64_t O = model->nobs;
+         const uint64_t F = model->nftr;
+         info("compacting model");
          mdl_compact(model);
+         info("%8"PRIu64" observations removed", O - model->nobs);
+         info("%8"PRIu64" features removed", F - model->nftr);
      }

      return self;
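With this change the native `train` method takes the training data and a development set as two separate arguments instead of reading them from the options; a hedged sketch of a direct call (file names are illustrative, and whether the high-level `Wapiti.train` wrapper forwards a development set this way is not shown in this diff):

    model = Wapiti::Model.new(pattern: 'pattern.txt', algorithm: 'l-bfgs')
    model.train('chtrain.txt', 'chdevel.txt')
    # pass nil as the second argument to train without a development set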
@@ -980,8 +940,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {

      if (N == 1) {
          tag_viterbi(model, seq, out, scs, psc);
-     }
-     else {
+     } else {
          tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
      }

@@ -993,16 +952,13 @@
          if (!model->opt->label) {
              VALUE token = rb_str_new2(raw->lines[t]);

- #ifdef HAVE_RUBY_ENCODING_H
              int enc = rb_enc_find_index("UTF-8");
              rb_enc_associate_index(token, enc);
- #endif

              rb_ary_push(tokens, token);
          }

          for (n = 0; n < N; ++n) {
-
              uint64_t lbl = out[t * N + n];
              rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));

@@ -1010,7 +966,6 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
              if (model->opt->outsc) {
                  rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
              }
-
          }

          // yield token/label pair to block if given
@@ -1020,9 +975,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {

          rb_ary_push(sequence, tokens);

-
          // TODO output sequence score: scs[n] (float)
-
      }

      // Statistics
@@ -1036,8 +989,7 @@
              if (seq->pos[t].lbl != out[t * N]) {
                  terr++;
                  err = 1;
-             }
-             else {
+             } else {
                  stat[2][out[t * N]]++;
              }
          }
@@ -1053,10 +1005,8 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {

          serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
          rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
-
      }

-
      // Cleanup memory used for this sequence
      xfree(scs);
      xfree(psc);
@@ -1090,7 +1040,6 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
          for (j = 0; j < k; ++j) {
              VALUE line = rb_ary_entry(sequence, j);
              Check_Type(line, T_STRING);
-
              raw->lines[j] = StringValueCStr(line);
          }

@@ -1103,13 +1052,7 @@
  }

  static VALUE decode_sequence_file(VALUE self, VALUE path) {
-     Check_Type(path, T_STRING);
-     FILE *file;
-
-     if (!(file = fopen(StringValueCStr(path), "r"))) {
-         rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
-     }
-
+     FILE *file = ufopen(path, "r");
      mdl_t *model = get_model(self);
      raw_t *raw;

@@ -1119,7 +1062,6 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
      // to take care of not discarding the raw input as we want to send it
      // back to the output with the additional predicted labels.
      while (!feof(file)) {
-
          // So, first read an input sequence keeping the raw_t object
          // available, and label it with Viterbi.
          if ((raw = rdr_readraw(model->reader, file)) == 0) {
@@ -1133,12 +1075,12 @@
      return result;
  }

- // cal-seq:
+ // call-seq:
  //   m.label(tokens, options = {})   # => array of labelled tokens
  //   m.label(filename, options = {}) # => array of labelled tokens
  //
  static VALUE model_label(VALUE self, VALUE data) {
-     VALUE result;
+     VALUE result = (VALUE)0;

      switch (TYPE(data)) {
      case T_STRING:
@@ -1148,7 +1090,7 @@ static VALUE model_label(VALUE self, VALUE data) {
          result = decode_sequence_array(self, data);
          break;
      default:
-         rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
+         fatal("failed to label data: invalid data (expected type String or Array)");
      }

      return result;
@@ -1157,125 +1099,33 @@ static VALUE model_label(VALUE self, VALUE data) {
  static void Init_model() {
      cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
      rb_define_alloc_func(cModel, allocate_model);
-
-     rb_define_method(cModel, "initialize", initialize_model, -1);
-
      rb_define_attr(cModel, "options", 1, 0);

-
+     rb_define_method(cModel, "initialize", initialize_model, -1);
      rb_define_method(cModel, "nlbl", model_nlbl, 0);
      rb_define_method(cModel, "labels", model_labels, 0);
-
      rb_define_method(cModel, "nobs", model_nobs, 0);
      rb_define_alias(cModel, "observations", "nobs");
-
      rb_define_method(cModel, "nftr", model_nftr, 0);
      rb_define_alias(cModel, "features", "nftr");
-
-     rb_define_method(cModel, "total", model_total, 0);
-
      rb_define_method(cModel, "sync", model_sync, 0);
      rb_define_method(cModel, "compact", model_compact, 0);
      rb_define_method(cModel, "save", model_save, -1);
      rb_define_method(cModel, "load", model_load, -1);
-
-     rb_define_method(cModel, "train", model_train, 1);
+     rb_define_method(cModel, "train", model_train, 2);
      rb_define_method(cModel, "label", model_label, 1);
  }

- /* --- Top-Level Utility Methods --- */
-
-
- static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
-     if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
-         rb_raise(cNativeError, "argument must be a native options instance");
-     }
-
-     opt_t *options = get_options(rb_options);
-
-     if (options->mode != 1) {
-         rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
-     }
-
-     mdl_t *model = mdl_new(rdr_new(options->maxent));
-     model->opt = options;
-
-     dolabel(model);
-
-     mdl_free(model);
-
-     return Qnil;
- }
-
- #if defined EXTRA
- static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
-     if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
-         rb_raise(cNativeError, "argument must be a native options instance");
-     }
-
-     opt_t *options = get_options(rb_options);
-
-     if (options->mode != 2) {
-         rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
-     }
-
-     mdl_t *model = mdl_new(rdr_new(options->maxent));
-     model->opt = options;
-
-     dodump(model);
-
-     mdl_free(model);
-
-     return Qnil;
- }
-
- // This function is a proxy for Wapiti's main entry point.
- static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
-     int result = -1, argc = 0;
-     char **ap, *argv[18], *input, *tmp;
-
-     Check_Type(arguments, T_STRING);
-     tmp = StringValueCStr(arguments);
-
-     // allocate space for argument vector
-     input = (char*)malloc(strlen(tmp) + 8);
-
-     // prepend command name
-     strncpy(input, "wapiti ", 8);
-     strncat(input, tmp, strlen(input) - 8);
-
-     // remember allocation pointer
-     tmp = input;
-
-     // turn input string into argument vector (using
-     // only the first seventeen tokens from input)
-     for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
-         if ((**ap != '\0') && (++ap >= &argv[18])) break;
-     }
-
-     // call main entry point
-     result = wapiti_main(argc, argv);
-
-     // free allocated memory
-     free(tmp);
-
-     return INT2FIX(result);
- }
- #endif
-
  /* --- Wapiti Extension Entry Point --- */

  void Init_native() {
      mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
      mNative = rb_define_module_under(mWapiti, "Native");

+     cArgumentError = rb_const_get(rb_mKernel, rb_intern("ArgumentError"));
      cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
-     cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
      cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);

-     rb_define_singleton_method(mNative, "label", label, 1);
-     // rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
-
      rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));

      Init_options();