ariel 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/LICENSE +21 -0
  2. data/README +98 -0
  3. data/bin/ariel +56 -0
  4. data/examples/google_calculator/labeled/1 +43 -0
  5. data/examples/google_calculator/labeled/2 +41 -0
  6. data/examples/google_calculator/labeled/3 +41 -0
  7. data/examples/google_calculator/structure.rb +12 -0
  8. data/examples/google_calculator/structure.yaml +46 -0
  9. data/examples/google_calculator/unlabeled/1 +43 -0
  10. data/examples/google_calculator/unlabeled/2 +43 -0
  11. data/examples/raa/labeled/highline.html +135 -0
  12. data/examples/raa/labeled/mongrel.html +168 -0
  13. data/examples/raa/structure.rb +17 -0
  14. data/examples/raa/structure.yaml +183 -0
  15. data/examples/raa/unlabeled/pdf-writer.html +175 -0
  16. data/lib/ariel/candidate_selector.rb +94 -0
  17. data/lib/ariel/example_document_loader.rb +59 -0
  18. data/lib/ariel/extracted_node.rb +20 -0
  19. data/lib/ariel/label_utils.rb +71 -0
  20. data/lib/ariel/learner.rb +237 -0
  21. data/lib/ariel/node_like.rb +26 -0
  22. data/lib/ariel/rule.rb +112 -0
  23. data/lib/ariel/rule_set.rb +34 -0
  24. data/lib/ariel/structure_node.rb +75 -0
  25. data/lib/ariel/token.rb +68 -0
  26. data/lib/ariel/token_stream.rb +240 -0
  27. data/lib/ariel/wildcards.rb +33 -0
  28. data/lib/ariel.rb +69 -0
  29. data/test/ariel_test_case.rb +15 -0
  30. data/test/fixtures.rb +43 -0
  31. data/test/specs/token_spec.rb +65 -0
  32. data/test/specs/token_stream_spec.rb +43 -0
  33. data/test/specs/wildcards_spec.rb +26 -0
  34. data/test/test_candidate_selector.rb +58 -0
  35. data/test/test_example_document_loader.rb +7 -0
  36. data/test/test_label_utils.rb +15 -0
  37. data/test/test_learner.rb +38 -0
  38. data/test/test_rule.rb +38 -0
  39. data/test/test_structure_node.rb +81 -0
  40. data/test/test_token.rb +16 -0
  41. data/test/test_token_stream.rb +82 -0
  42. data/test/test_wildcards.rb +18 -0
  43. metadata +103 -0
@@ -0,0 +1,168 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
2
+ <html>
3
+ <head>
4
+ <base href="http://raa.ruby-lang.org/">
5
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
6
+ <meta name="Author" content="ruby-lang.org">
7
+ <meta http-equiv="content-style-type" content="text/css">
8
+ <link rev="made" href="mailto:raa-admin@ruby-lang.org">
9
+ <link rel="home" href="http://raa.ruby-lang.org/">
10
+ <link rel="index" href="index.html">
11
+ <link rel="search" href="search.rhtml">
12
+ <link rel="glossary" href="all.html">
13
+ <link rel="SHORTCUT ICON" href="/favicon.ico">
14
+ <link rel="stylesheet" href="raa.css" type="text/css" media="all">
15
+
16
+ <title>RAA - <l:name>mongrel</l:name></title>
17
+ </head>
18
+
19
+ <body>
20
+ <div class="header">
21
+ <h1>RAA - mongrel</h1>
22
+ </div>
23
+
24
+ <form method="get" action="search.rhtml">
25
+ <div class="header-searchbox">
26
+ <input name="search" type="text" size="20" maxlength="63"><input type="submit" value="Search"><br>
27
+ <a href="index.html#search">advanced search</a>
28
+ </div>
29
+ </form>
30
+
31
+
32
+ <p class="caption">
33
+ mongrel / <l:current_version>0.3.12</l:current_version>
34
+ </p>
35
+
36
+ <table class="entry">
37
+
38
+ <tr><th>Short description: </th>
39
+ <td><l:short_description>Fast HTTP 1.1 library and server for Ruby web applications.</l:short_description></td>
40
+ </tr>
41
+ <tr><th>Category: </th>
42
+ <td><l:category><a href="cat.rhtml?category_major=Library">Library</a>/<a href="cat.rhtml?category_major=Library;category_minor=Web">Web</a></l:category></td>
43
+ </tr>
44
+ <tr><th>Status: </th>
45
+ <td>beta</td>
46
+ </tr>
47
+ <tr><th>Created: </th>
48
+ <td>2006-02-12 21:12:33 GMT</td>
49
+ </tr>
50
+ <tr>
51
+ <th>Last update: </th>
52
+ <td>2006-03-30 10:42:09 GMT</td>
53
+ </tr>
54
+ <tr><th>Owner: </th>
55
+ <td><a href="mailto:zedshaw@zedshaw.com"><l:owner>Zed A. Shaw</l:owner></a>
56
+ (<a href="owner.rhtml?id=1821">Projects of this owner</a>)</td>
57
+ </tr>
58
+ <tr><th>Homepage: </th>
59
+ <td><a href="http://mongrel.rubyforge.org/"><l:homepage>http://mongrel.rubyforge.org/</l:homepage></a></td>
60
+ </tr>
61
+ <tr><th>Download: </th>
62
+ <td>
63
+ <a href="http://rubyforge.org/frs/?group_id=1306">http://rubyforge.org/frs/?group_id=1306</a>
64
+
65
+ </td>
66
+ </tr>
67
+
68
+ <tr><th>License: </th>
69
+ <td><l:license>LGPL</l:license></td>
70
+ </tr>
71
+ <tr><th>Dependency: </th>
72
+ <td colspan='5'>
73
+ <table>
74
+
75
+ <tr width="100%">
76
+ <td nowrap>Requires:</td>
77
+ <td nowrap>
78
+
79
+ <a href="project/daemons/0.4.2">daemons/0.4.2</a>(*)
80
+
81
+ </td>
82
+ <td width="100%">Used for Ruby on Rails support</td>
83
+ </tr>
84
+
85
+ <tr width="100%">
86
+ <td nowrap>Requires:</td>
87
+ <td nowrap>
88
+
89
+ <a href="project/camping/">camping/1.2</a>(+)
90
+
91
+ </td>
92
+ <td width="100%">Optional if you want to run the examples for Camping.</td>
93
+ </tr>
94
+
95
+ <tr width="100%">
96
+ <td nowrap>Requires:</td>
97
+ <td nowrap>
98
+
99
+ <a href="project/rails/">rails/1.0</a>(+)
100
+
101
+ </td>
102
+ <td width="100%">Optional if you want the mongrel_rails runner.</td>
103
+ </tr>
104
+
105
+ <tr width="100%">
106
+ <td nowrap>Requires:</td>
107
+ <td nowrap>
108
+
109
+ <a href="project/win32-service/0.5.0">win32-service/0.5.0</a>(*)
110
+
111
+ </td>
112
+ <td width="100%">For win32-service stuff. Mongrel project provides a gem.</td>
113
+ </tr>
114
+
115
+ <tr><td colspan="3">(*): newer version exists</td></tr>
116
+
117
+ <tr><td colspan="3">(+): no such version</td></tr>
118
+
119
+ </table>
120
+ </td>
121
+ </tr>
122
+ <tr><th>Description: </th>
123
+ <td><p>Mongrel is a fast HTTP library and server for Ruby that is intended for hosting Ruby web applications of any kind using plain HTTP rather than FastCGI or SCGI. It is framework agnostic and already supports Ruby On Rails, Og+Nitro, and Camping frameworks. Runs on any POSIX system and Win32 (including service support under Win32). It also supports a complete plugin system based on RubyGems called GemPlugins.</p>
124
+ </td>
125
+ </tr>
126
+
127
+ <tr><th>Versions: </th>
128
+ <td>
129
+ <l:version_history>[<a href="project/mongrel/0.3.12">0.3.12</a> (2006-03-30)]
130
+
131
+ [<a href="project/mongrel/0.3.11">0.3.11</a> (2006-03-15)]
132
+
133
+ [<a href="project/mongrel/0.3.10">0.3.10</a> (2006-03-12)]
134
+
135
+ [<a href="project/mongrel/0.3.9">0.3.9</a> (2006-03-06)]
136
+
137
+ [<a href="project/mongrel/0.3.8">0.3.8</a> (2006-03-04)]
138
+
139
+ [<a href="project/mongrel/0.3.6">0.3.6</a> (2006-02-23)]
140
+
141
+ [<a href="project/mongrel/0.3.2">0.3.2</a> (2006-02-13)]
142
+
143
+ [<a href="project/mongrel/0.3.1">0.3.1</a> (2006-02-12)]</l:version_history>
144
+
145
+ </td>
146
+ </tr>
147
+
148
+ </table>
149
+
150
+ <p class="caption">
151
+ <a href="list.rhtml?name=mongrel">Edit this project (for project owner)</a>
152
+ </p>
153
+
154
+ <p class="caption">
155
+ <a href="index.html">back to RAA top</a>
156
+ </p>
157
+
158
+ <div class="footer">
159
+ <hr>
160
+ <address>
161
+ For all questions or comments, or if you have any inquiries about this page, contact <a title="Send Feedback for RAA" href="mailto:raa-admin@ruby-lang.org">raa-admin@ruby-lang.org</a>.
162
+ </address>
163
+ </div>
164
+
165
+ </body>
166
+ </html>
167
+
168
+
@@ -0,0 +1,17 @@
1
+ require 'ariel'
2
+ require 'yaml'
3
+
4
# Fields labeled in the RAA example pages, in the order they are registered
# on the root structure node.
raa_fields = [
  :name,
  :current_version,
  :short_description,
  :category,
  :owner,
  :homepage,
  :license,
  :version_history
]

# Build the structure tree: one item per labeled field.
structure = Ariel::StructureNode.new do |root|
  raa_fields.each { |field| root.item field }
end

# Serialize the structure tree to structure.yaml in the current directory.
File.open('structure.yaml', 'wb') { |out| YAML.dump(structure, out) }
@@ -0,0 +1,183 @@
1
+ --- &id001 !ruby/object:Ariel::StructureNode
2
+ children:
3
+ :version_history: !ruby/object:Ariel::StructureNode
4
+ children: {}
5
+
6
+ meta: !ruby/object:OpenStruct
7
+ table:
8
+ :name: :version_history
9
+ :node_type: :not_list
10
+ parent: *id001
11
+ ruleset: !ruby/object:Ariel::RuleSet
12
+ end_rules:
13
+ - !ruby/object:Ariel::Rule
14
+ direction: :back
15
+ landmarks:
16
+ - - </td>
17
+ start_rules:
18
+ - !ruby/object:Ariel::Rule
19
+ direction: :forward
20
+ landmarks:
21
+ - - <td>
22
+ - - Versions
23
+ - - <td>
24
+ :short_description: !ruby/object:Ariel::StructureNode
25
+ children: {}
26
+
27
+ meta: !ruby/object:OpenStruct
28
+ table:
29
+ :name: :short_description
30
+ :node_type: :not_list
31
+ parent: *id001
32
+ ruleset: !ruby/object:Ariel::RuleSet
33
+ end_rules:
34
+ - !ruby/object:Ariel::Rule
35
+ direction: :back
36
+ landmarks:
37
+ - - </td>
38
+ - - Category
39
+ - - </td>
40
+ start_rules:
41
+ - !ruby/object:Ariel::Rule
42
+ direction: :forward
43
+ landmarks:
44
+ - - <td>
45
+ :current_version: !ruby/object:Ariel::StructureNode
46
+ children: {}
47
+
48
+ meta: !ruby/object:OpenStruct
49
+ table:
50
+ :name: :current_version
51
+ :node_type: :not_list
52
+ parent: *id001
53
+ ruleset: !ruby/object:Ariel::RuleSet
54
+ end_rules:
55
+ - !ruby/object:Ariel::Rule
56
+ direction: :back
57
+ landmarks:
58
+ - - </p>
59
+ - - table
60
+ - - </p>
61
+ start_rules:
62
+ - !ruby/object:Ariel::Rule
63
+ direction: :forward
64
+ landmarks:
65
+ - - /
66
+ - - caption
67
+ - - /
68
+ :homepage: !ruby/object:Ariel::StructureNode
69
+ children: {}
70
+
71
+ meta: !ruby/object:OpenStruct
72
+ table:
73
+ :name: :homepage
74
+ :node_type: :not_list
75
+ parent: *id001
76
+ ruleset: !ruby/object:Ariel::RuleSet
77
+ end_rules:
78
+ - !ruby/object:Ariel::Rule
79
+ direction: :back
80
+ landmarks:
81
+ - - </a>
82
+ - - Download
83
+ - - </a>
84
+ start_rules:
85
+ - !ruby/object:Ariel::Rule
86
+ direction: :forward
87
+ landmarks:
88
+ - - ">"
89
+ - - rubyforge
90
+ - - ">"
91
+ :category: !ruby/object:Ariel::StructureNode
92
+ children: {}
93
+
94
+ meta: !ruby/object:OpenStruct
95
+ table:
96
+ :name: :category
97
+ :node_type: :not_list
98
+ parent: *id001
99
+ ruleset: !ruby/object:Ariel::RuleSet
100
+ end_rules:
101
+ - !ruby/object:Ariel::Rule
102
+ direction: :back
103
+ landmarks:
104
+ - - </td>
105
+ - - Status
106
+ - - </td>
107
+ start_rules:
108
+ - !ruby/object:Ariel::Rule
109
+ direction: :forward
110
+ landmarks:
111
+ - - <td>
112
+ - - <td>
113
+ :name: !ruby/object:Ariel::StructureNode
114
+ children: {}
115
+
116
+ meta: !ruby/object:OpenStruct
117
+ table:
118
+ :name: :name
119
+ :node_type: :not_list
120
+ parent: *id001
121
+ ruleset: !ruby/object:Ariel::RuleSet
122
+ end_rules:
123
+ - !ruby/object:Ariel::Rule
124
+ direction: :back
125
+ landmarks:
126
+ - - </title>
127
+ start_rules:
128
+ - !ruby/object:Ariel::Rule
129
+ direction: :forward
130
+ landmarks:
131
+ - - "-"
132
+ - - RAA
133
+ - "-"
134
+ :owner: !ruby/object:Ariel::StructureNode
135
+ children: {}
136
+
137
+ meta: !ruby/object:OpenStruct
138
+ table:
139
+ :name: :owner
140
+ :node_type: :not_list
141
+ parent: *id001
142
+ ruleset: !ruby/object:Ariel::RuleSet
143
+ end_rules:
144
+ - !ruby/object:Ariel::Rule
145
+ direction: :back
146
+ landmarks:
147
+ - - </a>
148
+ - - id
149
+ - - </a>
150
+ start_rules:
151
+ - !ruby/object:Ariel::Rule
152
+ direction: :forward
153
+ landmarks:
154
+ - - ">"
155
+ - - Owner
156
+ - - ">"
157
+ :license: !ruby/object:Ariel::StructureNode
158
+ children: {}
159
+
160
+ meta: !ruby/object:OpenStruct
161
+ table:
162
+ :name: :license
163
+ :node_type: :not_list
164
+ parent: *id001
165
+ ruleset: !ruby/object:Ariel::RuleSet
166
+ end_rules:
167
+ - !ruby/object:Ariel::Rule
168
+ direction: :back
169
+ landmarks:
170
+ - - </td>
171
+ - - Dependency
172
+ - - </td>
173
+ start_rules:
174
+ - !ruby/object:Ariel::Rule
175
+ direction: :forward
176
+ landmarks:
177
+ - - <td>
178
+ - - License
179
+ - - <td>
180
+ meta: !ruby/object:OpenStruct
181
+ table:
182
+ :name: :root
183
+ :node_type: :not_list
@@ -0,0 +1,175 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
2
+ <html>
3
+ <head>
4
+ <base href="http://raa.ruby-lang.org/">
5
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
6
+ <meta name="Author" content="ruby-lang.org">
7
+ <meta http-equiv="content-style-type" content="text/css">
8
+ <link rev="made" href="mailto:raa-admin@ruby-lang.org">
9
+ <link rel="home" href="http://raa.ruby-lang.org/">
10
+ <link rel="index" href="index.html">
11
+ <link rel="search" href="search.rhtml">
12
+ <link rel="glossary" href="all.html">
13
+ <link rel="SHORTCUT ICON" href="/favicon.ico">
14
+ <link rel="stylesheet" href="raa.css" type="text/css" media="all">
15
+
16
+ <title>RAA - pdf-writer</title>
17
+ </head>
18
+
19
+ <body>
20
+ <div class="header">
21
+ <h1>RAA - pdf-writer</h1>
22
+ </div>
23
+
24
+ <form method="get" action="search.rhtml">
25
+ <div class="header-searchbox">
26
+ <input name="search" type="text" size="20" maxlength="63"><input type="submit" value="Search"><br>
27
+ <a href="index.html#search">advanced search</a>
28
+ </div>
29
+ </form>
30
+
31
+
32
+ <p class="caption">
33
+ pdf-writer / 1.1.3
34
+ </p>
35
+
36
+ <table class="entry">
37
+
38
+ <tr><th>Short description: </th>
39
+ <td>Native Ruby PDF Document Writer</td>
40
+ </tr>
41
+ <tr><th>Category: </th>
42
+ <td><a href="cat.rhtml?category_major=Library">Library</a>/<a href="cat.rhtml?category_major=Library;category_minor=Document">Document</a></td>
43
+ </tr>
44
+ <tr><th>Status: </th>
45
+ <td>Stable</td>
46
+ </tr>
47
+ <tr><th>Created: </th>
48
+ <td>2003-09-17 03:59:34 GMT</td>
49
+ </tr>
50
+ <tr>
51
+ <th>Last update: </th>
52
+ <td>2005-09-10 02:12:20 GMT</td>
53
+ </tr>
54
+ <tr><th>Owner: </th>
55
+ <td><a href="mailto:austin+raa@halostatue.ca">Austin Ziegler</a>
56
+ (<a href="owner.rhtml?id=788">Projects of this owner</a>)</td>
57
+ </tr>
58
+ <tr><th>Homepage: </th>
59
+ <td><a href="http://ruby-pdf.rubyforge.org/pdf-writer/">http://ruby-pdf.rubyforge.org/pdf-writer/</a></td>
60
+ </tr>
61
+ <tr><th>Download: </th>
62
+ <td>
63
+ <a href="http://rubyforge.org/frs/?group_id=81">http://rubyforge.org/frs/?group_id=81</a>
64
+
65
+ </td>
66
+ </tr>
67
+
68
+ <tr><th>License: </th>
69
+ <td>BSD-type</td>
70
+ </tr>
71
+ <tr><th>Dependency: </th>
72
+ <td colspan='5'>
73
+ <table>
74
+
75
+ <tr width="100%">
76
+ <td nowrap>Requires:</td>
77
+ <td nowrap>
78
+
79
+ <a href="project/trans-simple/1.3.0">trans-simple/1.3.0</a>
80
+
81
+ </td>
82
+ <td width="100%">Transaction::Simple</td>
83
+ </tr>
84
+
85
+ <tr width="100%">
86
+ <td nowrap>Requires:</td>
87
+ <td nowrap>
88
+
89
+ <a href="project/color-tools/1.3.0">color-tools/1.3.0</a>
90
+
91
+ </td>
92
+ <td width="100%">color-tools</td>
93
+ </tr>
94
+
95
+ </table>
96
+ </td>
97
+ </tr>
98
+ <tr><th>Description: </th>
99
+ <td><h1>PDF::Writer for Ruby</h1>
100
+ <p>
101
+ This library provides the ability to create PDF documents using only native
102
+ Ruby libraries. There are several demo programs available in the demo/
103
+ directory. The canonical documentation for PDF::Writer is
104
+ &quot;manual.pdf&quot;, which can be generated using bin/techbook (just
105
+ &quot;techbook&quot; for RubyGem users) and the manual file
106
+ &quot;manual.pwd&quot;.
107
+ </p>
108
+ <table>
109
+ <tr><td valign="top">Homepage:</td><td><a
110
+ href="http://rubyforge.org/projects/ruby-pdf">rubyforge.org/projects/ruby-pdf</a>/
111
+
112
+ </td></tr>
113
+ <tr><td valign="top">Copyright:</td><td>2003—2005, Austin Ziegler
114
+
115
+ </td></tr>
116
+ </table>
117
+ <p>
118
+ This software is based on Adobe&#8217;s PDF Reference, Fifth Edition,
119
+ version 1.6. This and earlier editions are available from Adobe&#8217;s PDF
120
+ developer <a
121
+ href="http://partners.adobe.com/public/developer/pdf/index_reference.html">website</a>.
122
+ </p>
123
+ <h2>LICENCE NOTES</h2>
124
+ <p>
125
+ Please read the file LICENCE for licensing restrictions on this library, as
126
+ well as important patent considerations.
127
+ </p>
128
+ <h2>Requirements</h2>
129
+ <p>
130
+ PDF::Writer requires Ruby 1.8.2 or better, color-tools 1.3.0 or better, and
131
+ Transaction::Simple 1.3.0 or better.
132
+ </p>
133
+ </td>
134
+ </tr>
135
+
136
+ <tr><th>Versions: </th>
137
+ <td>
138
+ [<a href="project/pdf-writer/1.1.3">1.1.3</a> (2005-09-10)]
139
+
140
+ [<a href="project/pdf-writer/1.1.2">1.1.2</a> (2005-08-25)]
141
+
142
+ [<a href="project/pdf-writer/1.1.1">1.1.1</a> (2005-07-01)]
143
+
144
+ [<a href="project/pdf-writer/1.1.0">1.1.0</a> (2005-06-30)]
145
+
146
+ [<a href="project/pdf-writer/1.0.1">1.0.1</a> (2005-06-13)]
147
+
148
+ [<a href="project/pdf-writer/1.0.0">1.0.0</a> (2005-06-13)]
149
+
150
+ [<a href="project/pdf-writer/Technology%20Preview">Technology Preview</a> (2004-06-14)]
151
+
152
+ </td>
153
+ </tr>
154
+
155
+ </table>
156
+
157
+ <p class="caption">
158
+ <a href="list.rhtml?name=pdf-writer">Edit this project (for project owner)</a>
159
+ </p>
160
+
161
+ <p class="caption">
162
+ <a href="index.html">back to RAA top</a>
163
+ </p>
164
+
165
+ <div class="footer">
166
+ <hr>
167
+ <address>
168
+ For all questions or comments, or if you have any inquiries about this page, contact <a title="Send Feedback for RAA" href="mailto:raa-admin@ruby-lang.org">raa-admin@ruby-lang.org</a>.
169
+ </address>
170
+ </div>
171
+
172
+ </body>
173
+ </html>
174
+
175
+
@@ -0,0 +1,94 @@
1
module Ariel

  # Given an array of candidate Rules, and an array of LabeledStreams,
  # allows heuristics to be applied to select the ideal Rule. All select_*
  # instance methods replace the internal candidates array with the
  # candidates that survive the heuristic and return that array.
  class CandidateSelector

    attr_accessor :candidates

    # candidates:: the rules to choose between. The array is duplicated, just
    #              in case a CandidateSelector method directly modifies it,
    #              which would otherwise affect the caller's array.
    # examples::   the examples every heuristic scores the rules against.
    def initialize(candidates, examples)
      @candidates = candidates.dup
      @examples = examples
    end

    # Selects the Rule candidates that have the most matches of a given type
    # against the given examples. e.g. select_best_by_match_type(:early, :perfect)
    # will select the rules that have the most matches that are early or
    # perfect. When only one candidate is left it is returned without scoring.
    def select_best_by_match_type(*match_types)
      debug "Selecting best by match types #{match_types}"
      return @candidates if @candidates.size == 1
      @candidates = highest_scoring_by do |rule|
        @examples.select { |example| rule.matches(example, *match_types) }.size
      end
      @candidates
    end

    # All scoring functions use this indirectly. It iterates over each
    # Rule candidate, and assigns it a score in a hash of index:score pairs.
    # Each rule is yielded to the given block, which is expected to return that
    # rule's score.
    def score_by
      score_hash = {}
      @candidates.each_with_index do |rule, index|
        score_hash[index] = yield rule
      end
      score_hash
    end

    # Takes a scoring function as a block, and yields each rule to it. Returns
    # an array of the Rule candidates that have the highest score, in the same
    # order as they appear in the candidates array. Returns an empty array
    # when there are no candidates.
    def highest_scoring_by(&scorer)
      score_hash = score_by(&scorer)
      best_score = score_hash.values.max
      highest_scorers = []
      # Walk the candidates rather than the hash so the result order does not
      # depend on hash iteration order.
      @candidates.each_with_index do |rule, index|
        highest_scorers << rule if score_hash[index] == best_score
      end
      debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
      highest_scorers
    end

    # Keeps the candidates with the lowest wildcard_count. The count is negated
    # so that fewer wildcards means a higher score.
    def select_with_fewer_wildcards
      debug "Selecting the rules with the fewest wildcards"
      @candidates = highest_scoring_by { |rule| -rule.wildcard_count }
      @candidates
    end

    # Keeps the candidates whose match position is, on average, closest to the
    # label index of the examples they match. A rule that matches none of the
    # examples gets the worst possible score, so it can never beat a rule that
    # did match something.
    def select_closest_to_label
      debug "Selecting rules that match the examples closest to the label"
      @candidates = highest_scoring_by do |rule|
        distances = []
        @examples.each do |example|
          match_index = rule.apply_to(example)
          next if match_index.nil?
          distances << (example.label_index - match_index).abs
        end
        if distances.empty?
          -Float::INFINITY
        else
          total = distances.inject(0) { |sum, distance| sum + distance }
          -(total.to_f / distances.size) # negated mean distance: highest score = closest to label index
        end
      end
      @candidates
    end

    # Keeps the candidates whose final landmark contains the most elements.
    # A rule without any landmarks scores 0 instead of nil, so it can be
    # compared with the other candidates.
    def select_with_longer_end_landmarks
      debug "Selecting rules that have longer end landmarks"
      @candidates = highest_scoring_by do |rule|
        end_landmark = rule.landmarks.last
        end_landmark.nil? ? 0 : end_landmark.size
      end
      @candidates
    end

    # Returns a random candidate. Meant for making the final choice in case
    # previous selections have still left multiple candidates. Returns nil
    # when no candidates remain.
    def random_from_remaining
      debug "Selecting random from last #{candidates.size} candidate rules"
      @candidates.sort_by { rand }.first
    end
  end
end
@@ -0,0 +1,59 @@
1
module Ariel

  # Provides methods that read an example document, using a StructureNode tree
  # to populate a tree of Nodes with each labeled example.
  # TODO: Fix the UTF issues this implementation is bound to create.
  class ExampleDocumentLoader

    # Tokenizes one labeled example and files every extracted node (other than
    # the root) under its structure node in loaded_example_hash.
    #
    # file::                an object responding to #read, or the document
    #                       text itself.
    # structure::           the root of the structure tree; must have no parent.
    # loaded_example_hash:: Hash of structure node => array of extracted nodes.
    #                       Missing keys are created on demand, so a plain Hash
    #                       works as well as one with a default block.
    #
    # Returns loaded_example_hash. Raises ArgumentError if structure has a
    # parent.
    def self.load_labeled_example(file, structure, loaded_example_hash)
      raise ArgumentError, "Passed structure is not root parent" if structure.parent
      string = file.respond_to?(:read) ? file.read : file
      tokenstream = TokenStream.new
      tokenstream.tokenize(string, true)
      root = ExtractedNode.new(:root, tokenstream, structure)
      structure.apply_extraction_tree_on(root, true)
      root.each_descendant(true) do |extracted_node|
        if extracted_node.parent
          (loaded_example_hash[extracted_node.meta.structure] ||= []) << extracted_node
        end
        extracted_node.tokenstream.remove_label_tags
      end
      loaded_example_hash
    end

    # For every structure node in loaded_example_hash, learns a forward
    # (start) rule and a back (end) rule from its example nodes and stores
    # them on the structure node as a RuleSet.
    #
    # The structure argument is not used here; it is kept so existing callers
    # keep working.
    def self.supervise_learning(structure, loaded_example_hash)
      loaded_example_hash.each_pair do |structure_node, example_nodes|
        start_examples = []
        end_examples = []
        example_nodes.each do |node|
          # Rules are based on extracting from the parent
          start_tstream = node.parent.tokenstream
          start_tstream.set_label_at(node.tokenstream.tokens.first.start_loc)
          start_examples << start_tstream
          # The end rule is learnt against the reversed parent stream.
          end_tstream = node.parent.tokenstream.reverse
          end_tstream.set_label_at(node.tokenstream.tokens.last.start_loc)
          end_examples << end_tstream
        end
        learner = Learner.new(*start_examples)
        start_rules = learner.learn_rule :forward
        learner = Learner.new(*end_examples)
        end_rules = learner.learn_rule :back
        structure_node.ruleset = RuleSet.new(start_rules, end_rules)
      end
    end

    # Loads every labeled example found directly inside dir and learns rules
    # for the given structure tree from them.
    #
    # Entries whose name ends in structure.rb are skipped, as is anything that
    # is not a regular file (e.g. sub-directories). Entries are processed in
    # sorted order so the examples are loaded in the same order on every
    # platform.
    #
    # Returns structure.
    def self.load_directory(dir, structure)
      loaded_example_hash = Hash.new { |h, k| h[k] = [] }
      Dir.glob("#{dir}/*").sort.each do |doc|
        next if doc =~ /structure\.rb\z/
        next unless File.file?(doc)
        File.open(doc) do |file|
          load_labeled_example(file, structure, loaded_example_hash)
        end
      end
      supervise_learning structure, loaded_example_hash
      structure
    end

  end
end