huginn_naive_bayes_agent 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6d1a1c1469bb599fe8a5a42a94e89fd6206e519d
4
- data.tar.gz: d2ce17b2c8471199add13b73c92047cb84881c4a
3
+ metadata.gz: c4bf23b64717fc62020b16dcc96b3a9767e71f42
4
+ data.tar.gz: d3687649e9758f6f57e71478ec4edeb47c9d4c1e
5
5
  SHA512:
6
- metadata.gz: cb4ae84beb82c52900b99d21c9ca3d542f9e99f1bee34794ef1bb92e6191559d89f4afa6c8a7f5cbf5918e1967df8dadd0c89c90bd9d8f7ffd32382017a4018e
7
- data.tar.gz: a4c1e57328ddc1fef840f73aeeeb5072fc251b2d5fe03e649c175e43b46515775f0fa74336d3bbd1964280ee83276e6a8a0d431a786b7c59998c1ec188cdcc07
6
+ metadata.gz: 97a60c81731ae167e119b8bb68852341f15d7d5e1d6dd03ce728662a9adcdbe42ca5b2886dcfa8ba9e3c4cfe9186fd6f621f916dc3a303cd7a24c29182bcaf27
7
+ data.tar.gz: 86bc9dfe3906f16cac1851bc38257768ded8fbb7611a527c35946e19ac03939dd61e511e4056c44cc4171331be44f6c244fe13081c34fd2d18c37757c60c746b
@@ -18,7 +18,9 @@ module Agents
18
18
 
19
19
  However, if `nb_cats` is already populated, then the content from `nb_content` will be used as training data for the categories listed in `nb_cats`. For instance, say `nb_cats` consists of `trees`. Then `nb_content` will be used as training data for the category `trees`. The data is saved to the agent memory.
20
20
 
21
- When an event is received for classification, the Naive Bayes Agent will assign a value between 0 and 1 representing the likelihood that it falls under a category. The `min_value` option lets you choose the minimum threshold that must be reached before the event is labeled with that category. If `min_value` is set to 1, then the event is labeled with whichever category has the highest value.
21
+ Data in `nb_content` can be cleaned before it is used for training or classification. If `strip_punctuation` is set to true, every character in `nb_content` that is not a word character or whitespace (i.e. punctuation and other symbols) is removed before the text is sent to the classifier. The event's own `nb_content` field is left unchanged.
22
+
23
+ When an event is received for classification, the Naive Bayes Agent will assign a value between 0 and 1 representing the likelihood that it falls under a category. The `min_value` option lets you choose the minimum threshold that must be reached before the event is labeled with that category. If `min_value` is set to 1, then the event is labeled with whichever category has the highest value.
22
24
 
23
25
  The option `propagate_training_events` lets you choose whether the training events are emitted along with the classified events. If it is set to false, then no new event will be created from events that already had categories when they were received.
24
26
 
@@ -43,7 +45,8 @@ module Agents
43
45
  {
44
46
  'min_value' => "0.5",
45
47
  'propagate_training_events' => 'true',
46
- 'expected_update_period_in_days' => "7"
48
+ 'expected_update_period_in_days' => "7",
49
+ 'strip_punctuation' => 'false'
47
50
  }
48
51
  end
49
52
 
@@ -59,6 +62,16 @@ module Agents
59
62
  def receive(incoming_events)
60
63
  incoming_events.each do |event|
61
64
  nbayes = load(memory['data'])
65
+ # validate incoming payload
66
+ if !event.payload['nb_cats']
67
+ error("Missing `nb_cats` field in the event payload. #{event.payload.to_s}")
68
+ raise 'Missing `nb_cats` field in the event payload.'
69
+ end
70
+ if !event.payload['nb_content']
71
+ error("Missing `nb_content` field in the event payload. #{event.payload.to_s}")
72
+ raise 'Missing `nb_content` field in the event payload.'
73
+ end
74
+ # train or modify existing classifier
62
75
  if event.payload['nb_cats'].length > 0 and not event.payload['nb_cats'].include?("=class")
63
76
  cats = event.payload['nb_cats'].split(/\s+/)
64
77
  if cats[0] == "=loadYML"
@@ -73,16 +86,25 @@ module Agents
73
86
  nbayes.purge_less_than(event.payload['nb_content'].to_i)
74
87
  memory['data'] = YAML.dump(nbayes)
75
88
  else
89
+ nb_content = event.payload['nb_content']
90
+ if interpolated['strip_punctuation'] == "true"
91
+ nb_content = nb_content.gsub(/[^[:word:]\s]/, '') #https://stackoverflow.com/a/10074271
92
+ end
76
93
  cats.each do |c|
77
- c.starts_with?('-') ? nbayes.untrain(event.payload['nb_content'].split(/\s+/), c[1..-1]) : nbayes.train(event.payload['nb_content'].split(/\s+/), c)
94
+ c.starts_with?('-') ? nbayes.untrain(nb_content.split(/\s+/), c[1..-1]) : nbayes.train(nb_content.split(/\s+/), c)
78
95
  end
79
96
  memory['data'] = YAML.dump(nbayes)
80
- if interpolated['propagate_training_events'] = "true"
97
+ if interpolated['propagate_training_events'] == "true"
81
98
  create_event payload: event.payload
82
99
  end
83
100
  end
101
+ # classify new data
84
102
  else
85
- result = nbayes.classify(event.payload['nb_content'].split(/\s+/))
103
+ nb_content = event.payload['nb_content']
104
+ if interpolated['strip_punctuation'] == "true"
105
+ nb_content = nb_content.gsub(/[^[:word:]\s]/, '') #https://stackoverflow.com/a/10074271
106
+ end
107
+ result = nbayes.classify(nb_content.split(/\s+/))
86
108
  if interpolated['min_value'].to_f == 1
87
109
  event.payload['nb_cats'] << (event.payload['nb_cats'].length == 0 ? result.max_class : " "+result.max_class)
88
110
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: huginn_naive_bayes_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Greenstein
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-04 00:00:00.000000000 Z
11
+ date: 2017-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler