huginn_naive_bayes_agent 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/huginn_naive_bayes_agent/naive_bayes_agent.rb +27 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4bf23b64717fc62020b16dcc96b3a9767e71f42
|
4
|
+
data.tar.gz: d3687649e9758f6f57e71478ec4edeb47c9d4c1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 97a60c81731ae167e119b8bb68852341f15d7d5e1d6dd03ce728662a9adcdbe42ca5b2886dcfa8ba9e3c4cfe9186fd6f621f916dc3a303cd7a24c29182bcaf27
|
7
|
+
data.tar.gz: 86bc9dfe3906f16cac1851bc38257768ded8fbb7611a527c35946e19ac03939dd61e511e4056c44cc4171331be44f6c244fe13081c34fd2d18c37757c60c746b
|
@@ -18,7 +18,9 @@ module Agents
|
|
18
18
|
|
19
19
|
However, if `nb_cats` is already populated, then the content from `nb_content` will be used as training data for the categories listed in `nb_cats`. For instance, say `nb_cats` consists of `trees`. Then `nb_content` will be used as training data for the category `trees`. The data is saved to the agent memory.
|
20
20
|
|
21
|
-
|
21
|
+
Data in `nb_content` can be cleaned before classification. If `strip_punctuation` is set to true, the text in `nb_content` is stripped of punctuation before it is sent to the classifier. The changes are not saved to `nb_content`.
|
22
|
+
|
23
|
+
When an event is received for classification, the Naive Bayes Agent will assign a value between 0 and 1 representing the likelihood that it falls under a category. The `min_value` option lets you choose the minimum threshold that must be reached before the event is labeled with that category. If `min_value` is set to 1, then the event is labeled with whichever category has the highest value.
|
22
24
|
|
23
25
|
The option `propagate_training_events` lets you choose whether the training events are emitted along with the classified events. If it is set to false, then no new event will be created from events that already had categories when they were received.
|
24
26
|
|
@@ -43,7 +45,8 @@ module Agents
|
|
43
45
|
{
|
44
46
|
'min_value' => "0.5",
|
45
47
|
'propagate_training_events' => 'true',
|
46
|
-
'expected_update_period_in_days' => "7"
|
48
|
+
'expected_update_period_in_days' => "7",
|
49
|
+
'strip_punctuation' => 'false'
|
47
50
|
}
|
48
51
|
end
|
49
52
|
|
@@ -59,6 +62,16 @@ module Agents
|
|
59
62
|
def receive(incoming_events)
|
60
63
|
incoming_events.each do |event|
|
61
64
|
nbayes = load(memory['data'])
|
65
|
+
# validate incoming payload
|
66
|
+
if !event.payload['nb_cats']
|
67
|
+
error("Missing `nb_cats` field in the event payload. #{event.payload.to_s}")
|
68
|
+
raise 'Missing `nb_cats` field in the event payload.'
|
69
|
+
end
|
70
|
+
if !event.payload['nb_content']
|
71
|
+
error("Missing `nb_content` field in the event payload. #{event.payload.to_s}")
|
72
|
+
raise 'Missing `nb_content` field in the event payload.'
|
73
|
+
end
|
74
|
+
# train or modify existing classifier
|
62
75
|
if event.payload['nb_cats'].length > 0 and not event.payload['nb_cats'].include?("=class")
|
63
76
|
cats = event.payload['nb_cats'].split(/\s+/)
|
64
77
|
if cats[0] == "=loadYML"
|
@@ -73,16 +86,25 @@ module Agents
|
|
73
86
|
nbayes.purge_less_than(event.payload['nb_content'].to_i)
|
74
87
|
memory['data'] = YAML.dump(nbayes)
|
75
88
|
else
|
89
|
+
nb_content = event.payload['nb_content']
|
90
|
+
if interpolated['strip_punctuation'] == "true"
|
91
|
+
nb_content = nb_content.gsub(/[^[:word:]\s]/, '') #https://stackoverflow.com/a/10074271
|
92
|
+
end
|
76
93
|
cats.each do |c|
|
77
|
-
c.starts_with?('-') ? nbayes.untrain(
|
94
|
+
c.starts_with?('-') ? nbayes.untrain(nb_content.split(/\s+/), c[1..-1]) : nbayes.train(nb_content.split(/\s+/), c)
|
78
95
|
end
|
79
96
|
memory['data'] = YAML.dump(nbayes)
|
80
|
-
if interpolated['propagate_training_events']
|
97
|
+
if interpolated['propagate_training_events'] == "true"
|
81
98
|
create_event payload: event.payload
|
82
99
|
end
|
83
100
|
end
|
101
|
+
# classify new data
|
84
102
|
else
|
85
|
-
|
103
|
+
nb_content = event.payload['nb_content']
|
104
|
+
if interpolated['strip_punctuation'] == "true"
|
105
|
+
nb_content = nb_content.gsub(/[^[:word:]\s]/, '') #https://stackoverflow.com/a/10074271
|
106
|
+
end
|
107
|
+
result = nbayes.classify(nb_content.split(/\s+/))
|
86
108
|
if interpolated['min_value'].to_f == 1
|
87
109
|
event.payload['nb_cats'] << (event.payload['nb_cats'].length == 0 ? result.max_class : " "+result.max_class)
|
88
110
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: huginn_naive_bayes_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Greenstein
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|