opener-kaf-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +72 -0
- data/doc/css/common.css +68 -0
- data/lib/opener/kaf_parser.rb +16 -0
- data/lib/opener/kaf_parser/ast/base.rb +76 -0
- data/lib/opener/kaf_parser/ast/document.rb +33 -0
- data/lib/opener/kaf_parser/ast/opinion.rb +52 -0
- data/lib/opener/kaf_parser/ast/text.rb +75 -0
- data/lib/opener/kaf_parser/parser.rb +25 -0
- data/lib/opener/kaf_parser/presenter/html.rb +111 -0
- data/lib/opener/kaf_parser/presenter/text.rb +69 -0
- data/lib/opener/kaf_parser/sax_parser.rb +351 -0
- data/lib/opener/kaf_parser/version.rb +5 -0
- data/opener-kaf-parser.gemspec +29 -0
- metadata +155 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: efcd70b4613807928b75e36c34e96b8ec22737e1
|
4
|
+
data.tar.gz: 7fb60e7fb63d37192efd2e2938b2a503e712358d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b920bcb157bc0d1c5ce592ea7da6c454b89b2842dc055712d662f3a3fd8cdd762ecc3f61a4cfc15cc26e11e87ee701ab8a116734e92d13996d7b5efc0fa03d7c
|
7
|
+
data.tar.gz: cb85a15b2dc234e5d9abe33222f64131cbb6491a40ea243b3178065d73f30cb050c350cc9ce064a6fa7ac2c85d4fbf27bd1f9341971e3e47a417da9ce5912753
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2013, Olery
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
[Build status](https://drone.io/github.com/opener-project/ruby-kaf-parser/latest)
|
2
|
+
|
3
|
+
# Ruby KAF Parser
|
4
|
+
|
5
|
+
This repository contains the source code of the opener-kaf-parser, a simple and
|
6
|
+
fast KAF parser based on Nokogiri. The KAF parser is a stack based parser that
|
7
|
+
uses the SAX parsing API of Nokogiri, thus it should (in theory) be able to
|
8
|
+
handle large KAF files without too much trouble.
|
9
|
+
|
10
|
+
## Usage
|
11
|
+
|
12
|
+
Create a parser instance and parse some KAF:
|
13
|
+
|
14
|
+
require 'opener/kaf_parser'
|
15
|
+
|
16
|
+
parser = Opener::KafParser::Parser.new
|
17
|
+
ast = parser.parse('...')
|
18
|
+
|
19
|
+
The return value is an `Opener::KafParser::AST::Document` whose child nodes behave like
|
20
|
+
S expressions (and are formatted that way when calling `#inspect` on them).
|
21
|
+
Currently there are 3 node types:
|
22
|
+
|
23
|
+
* document
|
24
|
+
* text
|
25
|
+
* opinion
|
26
|
+
|
27
|
+
The latter groups a set of text nodes together that make up the opinion.
|
28
|
+
|
29
|
+
To iterate over these nodes you'd do something along the lines of the
|
30
|
+
following:
|
31
|
+
|
32
|
+
ast.language # => "en"
|
33
|
+
|
34
|
+
ast.children.each do |node|
|
35
|
+
if node.type == :text
|
36
|
+
puts "Word: #{node.inspect}"
|
37
|
+
else
|
38
|
+
puts "Opinion: #{node.inspect}"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
## Presenting Text
|
43
|
+
|
44
|
+
To present an AST/text you can use one of the standard presenter classes. For
|
45
|
+
example, if you want to turn an AST into a regular Ruby String you can use the
|
46
|
+
Text presenter:
|
47
|
+
|
48
|
+
ast = parser.parse('...')
|
49
|
+
presenter = Opener::KafParser::Presenter::Text.new
|
50
|
+
|
51
|
+
puts presenter.present(ast) # => "Hello, you are doing great"
|
52
|
+
|
53
|
+
Currently the following presenters are available:
|
54
|
+
|
55
|
+
* `Opener::KafParser::Presenter::Text`
|
56
|
+
* `Opener::KafParser::Presenter::HTML`
|
57
|
+
|
58
|
+
## Requirements
|
59
|
+
|
60
|
+
* Ruby 1.9.3 or newer
|
61
|
+
* libxml2 (newer versions of Nokogiri ship libxml themselves)
|
62
|
+
|
63
|
+
## Installation:
|
64
|
+
|
65
|
+
Installing as a Gem:
|
66
|
+
|
67
|
+
gem install opener-kaf-parser
|
68
|
+
|
69
|
+
Using Bundler:
|
70
|
+
|
71
|
+
gem 'opener-kaf-parser',
|
72
|
+
:git => 'git@github.com:opener-project/ruby-kaf-parser'
|
data/doc/css/common.css
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
body
|
2
|
+
{
|
3
|
+
font-size: 14px;
|
4
|
+
line-height: 1.6;
|
5
|
+
margin: 0 auto;
|
6
|
+
max-width: 960px;
|
7
|
+
}
|
8
|
+
|
9
|
+
p code
|
10
|
+
{
|
11
|
+
background: #f2f2f2;
|
12
|
+
padding-left: 3px;
|
13
|
+
padding-right: 3px;
|
14
|
+
}
|
15
|
+
|
16
|
+
pre.code
|
17
|
+
{
|
18
|
+
font-size: 13px;
|
19
|
+
line-height: 1.4;
|
20
|
+
}
|
21
|
+
|
22
|
+
/**
|
23
|
+
* YARD uses generic table styles, using a special class means those tables
|
24
|
+
* don't get messed up.
|
25
|
+
*/
|
26
|
+
.table
|
27
|
+
{
|
28
|
+
border: 1px solid #ccc;
|
29
|
+
border-right: none;
|
30
|
+
border-collapse: separate;
|
31
|
+
border-spacing: 0;
|
32
|
+
text-align: left;
|
33
|
+
}
|
34
|
+
|
35
|
+
.table.full
|
36
|
+
{
|
37
|
+
width: 100%;
|
38
|
+
}
|
39
|
+
|
40
|
+
.table .field_name
|
41
|
+
{
|
42
|
+
min-width: 160px;
|
43
|
+
}
|
44
|
+
|
45
|
+
.table thead tr th.no_sort:first-child
|
46
|
+
{
|
47
|
+
width: 25px;
|
48
|
+
}
|
49
|
+
|
50
|
+
.table thead tr th, .table tbody tr td
|
51
|
+
{
|
52
|
+
border-bottom: 1px solid #ccc;
|
53
|
+
border-right: 1px solid #ccc;
|
54
|
+
min-width: 20px;
|
55
|
+
padding: 8px 5px;
|
56
|
+
text-align: left;
|
57
|
+
vertical-align: top;
|
58
|
+
}
|
59
|
+
|
60
|
+
.table tbody tr:last-child td
|
61
|
+
{
|
62
|
+
border-bottom: none;
|
63
|
+
}
|
64
|
+
|
65
|
+
.table tr:nth-child(odd) td
|
66
|
+
{
|
67
|
+
background: #f9f9f9;
|
68
|
+
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'time'
|
3
|
+
require 'builder'
|
4
|
+
|
5
|
+
require_relative 'kaf_parser/version'
|
6
|
+
|
7
|
+
require_relative 'kaf_parser/ast/base'
|
8
|
+
require_relative 'kaf_parser/ast/document'
|
9
|
+
require_relative 'kaf_parser/ast/text'
|
10
|
+
require_relative 'kaf_parser/ast/opinion'
|
11
|
+
|
12
|
+
require_relative 'kaf_parser/sax_parser'
|
13
|
+
require_relative 'kaf_parser/parser'
|
14
|
+
|
15
|
+
require_relative 'kaf_parser/presenter/text'
|
16
|
+
require_relative 'kaf_parser/presenter/html'
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Common superclass of all AST nodes. It stores the node type, an
      # optional value and the child nodes, and offers S-expression style
      # inspection.
      #
      # @!attribute [rw] type
      #  @return [Symbol]
      #
      # @!attribute [rw] value
      #  @return [String]
      #
      # @!attribute [rw] children
      #  @return [Array<Opener::KafParser::AST::Base>]
      #
      class Base
        attr_accessor :type, :value, :children

        ##
        # Assigns every attribute whose key names a method this node responds
        # to, fills in defaults for `children` and `type`, and finally runs the
        # optional `after_initialize` hook of the subclass.
        #
        # @param [Hash] attributes
        #
        def initialize(attributes = {})
          attributes.each do |name, given|
            next unless respond_to?(name)

            instance_variable_set("@#{name}", given)
          end

          @children = [] unless @children
          @type     = :generic unless @type

          after_initialize if respond_to?(:after_initialize)
        end

        ##
        # Renders the node and its children as an indented S expression.
        #
        # @param [Numeric] indent Number of leading spaces for this node.
        # @return [String]
        #
        def inspect(indent = 0)
          parts    = [(' ' * indent) + "(#{type}"]
          rendered = children.map { |child| child.inspect(indent + 2) }

          parts << value.inspect if value
          parts << ("\n" + rendered.join("\n")) unless rendered.empty?

          parts.join(' ') + ')'
        end

        ##
        # Extra attributes of the node; the base class has none.
        #
        # @return [Hash]
        #
        def attributes
          {}
        end

        ##
        # @return [TrueClass|FalseClass]
        #
        def text?
          :text == type
        end

        ##
        # @return [TrueClass|FalseClass]
        #
        def opinion?
          :opinion == type
        end
      end # Base
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Node describing a `<KAF>` tag; the parsed child nodes live in
      # `children`.
      #
      # @!attribute [rw] language
      #  @return [String]
      #
      # @!attribute [rw] version
      #  @return [String]
      #
      class Document < Base
        attr_accessor :language, :version

        ##
        # Hook invoked by the base initializer; marks the node as a document.
        #
        def after_initialize
          @type = :document
        end

        ##
        # @return [Hash] The language and version of the document.
        #
        def attributes
          details = {}

          details[:language] = language
          details[:version]  = version

          details
        end
      end # Document
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Node describing an opinion: its holder, target, polarity and
      # strength. The nodes that make up the opinion expression are stored in
      # `children`.
      #
      # @!attribute [rw] id
      #  @return [String]
      #
      # @!attribute [rw] holder The nodes that make up the opinion holder.
      #  @return [Array]
      #
      # @!attribute [rw] target The nodes that make up the opinion target.
      #  @return [Array]
      #
      # @!attribute [rw] polarity
      #  @return [String]
      #
      # @!attribute [rw] strength
      #  @return [Numeric]
      #
      class Opinion < Base
        attr_accessor :id, :holder, :target, :polarity, :strength

        ##
        # Hook invoked by the base initializer; marks the node as an opinion
        # and makes sure holder and target are always arrays.
        #
        def after_initialize
          @type   = :opinion
          @holder = [] unless @holder
          @target = [] unless @target
        end

        ##
        # @return [Hash]
        #
        def attributes
          details = {}

          details[:id]       = id
          details[:holder]   = holder
          details[:target]   = target
          details[:polarity] = polarity
          details[:strength] = strength

          details
        end
      end # Opinion
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Opener
|
2
|
+
module KafParser
|
3
|
+
module AST
|
4
|
+
##
|
5
|
+
# Node class that contains information about a set of characters such as
|
6
|
+
# the polarity and POS.
|
7
|
+
#
|
8
|
+
# @!attribute [rw] id
|
9
|
+
# @return [Numeric]
|
10
|
+
#
|
11
|
+
# @!attribute [rw] sentence
|
12
|
+
# @return [Numeric]
|
13
|
+
#
|
14
|
+
# @!attribute [rw] paragraph
|
15
|
+
# @return [Numeric]
|
16
|
+
#
|
17
|
+
# @!attribute [rw] offset
|
18
|
+
# @return [Numeric]
|
19
|
+
#
|
20
|
+
# @!attribute [rw] length
|
21
|
+
# @return [Numeric]
|
22
|
+
#
|
23
|
+
# @!attribute [rw] word_type
|
24
|
+
# @return [String]
|
25
|
+
#
|
26
|
+
# @!attribute [rw] pos
|
27
|
+
# @return [String]
|
28
|
+
#
|
29
|
+
# @!attribute [rw] morphofeat
|
30
|
+
# @return [String]
|
31
|
+
#
|
32
|
+
# @!attribute [rw] sentiment_modifier
|
33
|
+
# @return [String]
|
34
|
+
#
|
35
|
+
# @!attribute [rw] polarity
|
36
|
+
# @return [String]
|
37
|
+
#
|
38
|
+
# @!attribute [rw] property
|
39
|
+
# @return [String]
|
40
|
+
#
|
41
|
+
class Text < Base
|
42
|
+
attr_accessor :id, :sentence, :paragraph, :offset, :length, :word_type,
|
43
|
+
:pos, :morphofeat, :sentiment_modifier, :polarity, :property
|
44
|
+
|
45
|
+
##
|
46
|
+
# Called after a new instance of this class is created.
|
47
|
+
#
|
48
|
+
def after_initialize
  @type = :text

  # Fall back to the length of the text when no explicit length was given.
  # A node may be created before its value is known, in which case the
  # length defaults to 0 instead of raising NoMethodError on nil.
  @length ||= value ? value.length : 0
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# @return [Hash]
|
56
|
+
#
|
57
|
+
def attributes
  # Collected one by one, in the same order as the accessors are declared.
  details = {}

  details[:id]                 = id
  details[:sentence]           = sentence
  details[:paragraph]          = paragraph
  details[:offset]             = offset
  details[:length]             = length
  details[:word_type]          = word_type
  details[:pos]                = pos
  details[:morphofeat]         = morphofeat
  details[:sentiment_modifier] = sentiment_modifier
  details[:polarity]           = polarity
  details[:property]           = property

  details
end
|
72
|
+
end # Text
|
73
|
+
end # AST
|
74
|
+
end # KafParser
|
75
|
+
end # Opener
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Opener
  module KafParser
    ##
    # Thin, user friendly facade around the Nokogiri SAX based parser.
    #
    class Parser
      ##
      # Feeds the KAF/XML input through a fresh {Opener::KafParser::SaxParser}
      # and returns the document that handler built.
      #
      # @param [String] input The XML/KAF to parse.
      # @return [Opener::KafParser::AST::Document]
      #
      def parse(input)
        handler = SaxParser.new

        Nokogiri::XML::SAX::Parser.new(handler).parse(input)

        handler.document
      end
    end # Parser
  end # KafParser
end # Opener
|
@@ -0,0 +1,111 @@
|
|
1
|
+
module Opener
|
2
|
+
module KafParser
|
3
|
+
module Presenter
|
4
|
+
##
|
5
|
+
# The HTML presenter takes an AST and turns it into a block of HTML where
|
6
|
+
# each word is wrapped in a tag and has various meta information (e.g.
|
7
|
+
# the polarity) assigned to it.
|
8
|
+
#
|
9
|
+
# Basic usage:
|
10
|
+
#
|
11
|
+
# parser = Opener::KafParser::Parser.new
|
12
|
+
# ast = parser.parse('...')
|
13
|
+
# presenter = Opener::KafParser::Presenter::HTML.new
|
14
|
+
#
|
15
|
+
# puts presenter.present(ast)
|
16
|
+
#
|
17
|
+
# ## Output
|
18
|
+
#
|
19
|
+
# The output is a set of span tags for each set of characters, span tags
|
20
|
+
# for whitespace and a set of span tags that group opinion expressions.
|
21
|
+
# Each span tag has a class indicating the type ("text", "opinion", etc)
|
22
|
+
# and a set of `data-*` attributes containing data such as the polarity.
|
23
|
+
# For example, the ID of a text node would be stored in `data-id`, the
|
24
|
+
# polarity in `data-polarity` and so forth.
|
25
|
+
#
|
26
|
+
class HTML < Text
|
27
|
+
##
|
28
|
+
# @return [String]
|
29
|
+
#
|
30
|
+
SPACE = ' '
|
31
|
+
|
32
|
+
##
|
33
|
+
# @return [Array]
|
34
|
+
#
|
35
|
+
TYPES_WHITELIST = [String, Numeric]
|
36
|
+
|
37
|
+
##
|
38
|
+
# Presents the AST as a collection of HTML tags.
|
39
|
+
#
|
40
|
+
# @param [Opener::KafParser::AST::Base] ast
|
41
|
+
# @return [String]
|
42
|
+
#
|
43
|
+
def present(ast)
  # All markup is accumulated in a single builder, starting at offset 0.
  markup = Builder::XmlMarkup.new

  render_ast(ast, 0, markup)

  markup.target!
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
##
|
55
|
+
# @param [Opener::KafParser::AST::Base] ast
|
56
|
+
# @param [Numeric] offset
|
57
|
+
# @param [Builder::XmlMarkup] builder
|
58
|
+
#
|
59
|
+
def render_ast(ast, offset, builder)
  ast.children.each do |node|
    if node.text?
      offset = render_node(node, offset, builder)
    else
      render_span(node, builder) do |_sub_builder|
        # Keep the offset reached inside the nested node. Without this the
        # first word after the group is padded as if the grouped words had
        # never been rendered.
        offset = render_ast(node, offset, builder)
      end
    end
  end

  # Returned so that a recursive caller can continue from this position.
  offset
end
|
70
|
+
|
71
|
+
##
|
72
|
+
# @see #render_ast
|
73
|
+
#
|
74
|
+
def render_node(node, offset, builder)
  # Number of characters between the previous node and this one.
  gap = node.offset - offset

  if gap > 0
    builder.span(:class => 'whitespace') { |inner| inner << SPACE * gap }
  end

  render_span(node, builder)

  calculate_offset(node)
end
|
87
|
+
|
88
|
+
##
|
89
|
+
# @param [Opener::KafParser::AST::Base] node
|
90
|
+
# @param [Builder::XmlMarkup] builder
|
91
|
+
#
|
92
|
+
def render_span(node, builder)
  attrs = {'class' => node.type}

  # Only store simple values in the HTML attributes. The check uses `is_a?`
  # because the class of a number (e.g. Integer) is never `Numeric` itself,
  # so comparing classes directly would drop every numeric attribute.
  node.attributes.each do |key, value|
    next unless TYPES_WHITELIST.any? { |type| value.is_a?(type) }

    attrs["data-#{key}"] = value
  end

  if block_given?
    builder.span(node.value, attrs) { |sub_builder| yield sub_builder }
  else
    builder.span(node.value, attrs)
  end
end
|
108
|
+
end # HTML
|
109
|
+
end # Presenter
|
110
|
+
end # KafParser
|
111
|
+
end # Opener
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Opener
  module KafParser
    module Presenter
      ##
      # The Text presenter class takes an AST and builds a plain Ruby string
      # in which the gaps between node offsets are filled with spaces.
      #
      class Text
        ##
        # Presents the AST as a plain Ruby String with no special formatting.
        #
        # @param [Opener::KafParser::AST::Base] ast
        # @return [String]
        #
        def present(ast)
          # `dup` yields a mutable buffer even when string literals are frozen.
          buffer = ''.dup

          render_ast(ast, 0, buffer)

          buffer
        end

        private

        ##
        # Renders all children of `ast`, recursing into non-text nodes.
        #
        # @param [Opener::KafParser::AST::Base] ast
        # @param [Numeric] offset Position reached in the text so far.
        # @param [String] buffer
        # @return [Numeric] The offset after the last rendered text node.
        #
        def render_ast(ast, offset, buffer)
          ast.children.each do |node|
            # Both branches hand back the new offset. Dropping the result of
            # the recursive call would pad the first word after a nested
            # node (e.g. an opinion) with far too many spaces.
            offset = if node.text?
              render_node(node, offset, buffer)
            else
              render_ast(node, offset, buffer)
            end
          end

          offset
        end

        ##
        # Appends the padding and the value of a single text node.
        #
        # @param [Opener::KafParser::AST::Text] node
        # @param [Numeric] offset
        # @param [String] buffer
        # @return [Numeric]
        #
        def render_node(node, offset, buffer)
          diff = node.offset - offset

          buffer << ' ' * diff if diff > 0
          buffer << node.value

          calculate_offset(node)
        end

        ##
        # @param [Opener::KafParser::AST::Text] node
        # @return [Numeric] The position directly after the node.
        #
        def calculate_offset(node)
          node.offset + node.length
        end
      end # Text
    end # Presenter
  end # KafParser
end # Opener
|
@@ -0,0 +1,351 @@
|
|
1
|
+
module Opener
  module KafParser
    ##
    # The SaxParser class is a Nokogiri SAX parser that builds a tree of
    # {Opener::KafParser::AST::Base} nodes containing word information such as
    # the polarity and Part Of Speech as well as grouping words together based
    # on the opinion expression they belong to.
    #
    # This SAX parser is a stack based parser. Every element name is turned
    # into an `on_*` (start) and `after_*` (end) callback name; elements for
    # which no such public method exists (e.g. the `<head>` of a KAF document)
    # are ignored.
    #
    # @!attribute [r] document
    #  @return [Opener::KafParser::AST::Document]
    #
    class SaxParser < Nokogiri::XML::SAX::Document
      attr_reader :document

      ##
      # @see Nokogiri::XML::SAX::Document#initialize
      #
      def initialize(*args)
        super

        @stack             = []
        @attributes        = []
        @document          = nil
        @characters        = ''.dup
        @targets           = []
        @buffer_characters = false
        @buffer_targets    = false
        @word_mapping      = {}
        @term_mapping      = {}
      end

      ##
      # Called at the start of an XML element. This method delegates the work
      # to individual method calls based on the node name.
      #
      # @param [String] name The name of the element.
      # @param [Array] attributes Array of `[name, value]` pairs.
      #
      def start_element(name, attributes = [])
        callback = 'on_' + callback_name(name)

        execute_callback(callback, associate_attributes(attributes))
      end

      ##
      # Called at the end of an XML element.
      #
      # @param [String] name The name of the element.
      #
      def end_element(name)
        execute_callback('after_' + callback_name(name))
      end

      ##
      # Processes the characters of an XML node. Text is only kept while a
      # `<wf>` element is open.
      #
      # @param [String] text
      #
      def characters(text)
        @characters << text if @buffer_characters
      end

      ##
      # Processes a `<KAF>` node.
      #
      # @param [Hash] attr
      #
      def on_kaf(attr)
        @stack << AST::Document.new(
          :language => attr.fetch('xml:lang', 'en'),
          :version  => attr['version']
        )
      end

      ##
      # @see #on_kaf
      #
      def after_kaf
        @document = @stack.pop
      end

      ##
      # Processes a `<wf>` node.
      #
      # @param [Hash] attr
      #
      def on_wf(attr)
        @stack << AST::Text.new(
          :id        => attr['wid'],
          :sentence  => attr['sent'].to_i,
          :offset    => attr['offset'].to_i,
          :length    => attr['length'].to_i,
          :paragraph => attr['para'].to_i
        )

        @buffer_characters = true
      end

      ##
      # Attaches the finished word to the enclosing node and remembers it by
      # its ID so terms can refer to it later.
      #
      # @see #on_wf
      #
      def after_wf
        wf       = @stack.pop
        wf.value = @characters

        current_object.children << wf

        @word_mapping[wf.id] = wf

        reset_character_buffer
      end

      ##
      # Processes a `<term>` node.
      #
      # @param [Hash] attr
      #
      def on_term(attr)
        @attributes << attr

        @buffer_targets = true
      end

      ##
      # Copies the term (and optional sentiment) attributes onto every word
      # targeted by the term.
      #
      # @see #on_term
      #
      def after_term
        attrs, sentiment = @attributes

        @targets.each do |target|
          word = @word_mapping[target]

          # A target may reference a word form that was never defined; skip
          # it instead of failing on nil.
          next unless word

          word.morphofeat = attrs['morphofeat']
          word.word_type  = attrs['type']
          word.pos        = attrs['pos']

          if sentiment
            word.sentiment_modifier = sentiment['sentiment_modifier']
            word.polarity           = sentiment['polarity']
          end

          # Map the term IDs to the word form node.
          @term_mapping[attrs['tid']] = word
        end

        reset_target_buffer
        reset_attributes_buffer
      end

      ##
      # Processes a `<target>` node.
      #
      # @param [Hash] attr
      #
      def on_target(attr)
        @targets << attr['id'] if @buffer_targets
      end

      ##
      # Processes a `<sentiment>` node.
      #
      # @param [Hash] attr
      #
      def on_sentiment(attr)
        @attributes << attr
      end

      ##
      # Processes a `<opinion>` node.
      #
      # @param [Hash] attr
      #
      def on_opinion(attr)
        @stack << AST::Opinion.new(:id => attr['oid'])
      end

      ##
      # Moves the words of the opinion expression out of the parent node and
      # puts the opinion node in their place.
      #
      # @see #on_opinion
      #
      def after_opinion
        opinion = @stack.pop
        parent  = current_object
        moved   = opinion.children.each_with_object({}) do |node, ids|
          ids[node.id] = true
        end

        # Insert the opinion node before the first node of the expression.
        # When that node cannot be found (empty expression, or the word was
        # already moved into another opinion) the opinion is appended, since
        # Array#insert does not accept a nil index.
        first_index = parent.children.index(opinion.children.first)
        first_index ||= parent.children.length

        parent.children.insert(first_index, opinion)

        # Remove the word nodes from the parent since they have been moved
        # into the opinion node. `reject!` is used because deleting from an
        # Array while iterating it with `each` skips the element that follows
        # every deletion.
        parent.children.reject! do |node|
          node.is_a?(AST::Text) && moved.key?(node.id)
        end
      end

      ##
      # @param [Hash] attr
      #
      def on_opinion_holder(attr)
        @buffer_targets = true
      end

      ##
      # @see #on_opinion_holder
      #
      def after_opinion_holder
        current_object.holder.concat(targeted_terms)

        reset_target_buffer
      end

      ##
      # @param [Hash] attr
      #
      def on_opinion_target(attr)
        @buffer_targets = true
      end

      ##
      # @see #on_opinion_target
      #
      def after_opinion_target
        current_object.target.concat(targeted_terms)

        reset_target_buffer
      end

      ##
      # Processes an `<opinion_expression>` node.
      #
      # @param [Hash] attr
      #
      def on_opinion_expression(attr)
        current_object.polarity = attr['polarity']
        current_object.strength = attr['strength'].to_i

        @buffer_targets = true
      end

      ##
      # @see #on_opinion_expression
      #
      def after_opinion_expression
        current_object.children.concat(targeted_terms)

        reset_target_buffer
      end

      ##
      # Processes a `<property>` node.
      #
      # @param [Hash] attr
      #
      def on_property(attr)
        @attributes << attr

        @buffer_targets = true
      end

      ##
      # Stores the lemma of the property on every targeted term.
      #
      # @see #on_property
      #
      def after_property
        attrs = @attributes.pop

        targeted_terms.each do |term|
          term.property = attrs['lemma']
        end

        reset_attributes_buffer
        reset_target_buffer
      end

      private

      ##
      # Returns a callback name for the given XML node name: an underscore
      # is inserted before a run of capitals that follows other characters
      # and the result is downcased.
      #
      # @param [String] name
      # @return [String]
      #
      def callback_name(name)
        name.gsub(/([^A-Z]+)([A-Z]+)/, '\\1_\\2').downcase
      end

      ##
      # Invokes the callback when a public method of that name exists.
      #
      # @param [String] name
      # @param [Array] args
      #
      def execute_callback(name, *args)
        send(name, *args) if respond_to?(name)
      end

      ##
      # Converts an Array of attributes into a Hash.
      #
      # @param [Array] attributes
      # @return [Hash]
      #
      def associate_attributes(attributes)
        attributes.each_with_object({}) do |pair, hash|
          hash[pair[0]] = pair[1]
        end
      end

      ##
      # Looks up the word nodes for the buffered target IDs. IDs without a
      # known term are left out so no nil ends up in the AST.
      #
      # @return [Array<Opener::KafParser::AST::Text>]
      #
      def targeted_terms
        @targets.map { |target| @term_mapping[target] }.compact
      end

      ##
      # @return [Mixed] The node currently on top of the stack.
      #
      def current_object
        @stack.last
      end

      ##
      # Resets the character buffer and disables buffering.
      #
      def reset_character_buffer
        @buffer_characters = false
        @characters        = ''.dup
      end

      ##
      # Resets the target buffer and disables buffering.
      #
      def reset_target_buffer
        @buffer_targets = false
        @targets        = []
      end

      ##
      # Resets the attributes buffer.
      #
      def reset_attributes_buffer
        @attributes = []
      end
    end # SaxParser
  end # KafParser
end # Opener
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path('../lib/opener/kaf_parser/version', __FILE__)

# Gem specification for opener-kaf-parser.
Gem::Specification.new do |gem|
  gem.name        = 'opener-kaf-parser'
  gem.version     = Opener::KafParser::VERSION
  # The contact address lives in `email` rather than inside the author name.
  gem.authors     = ['Yorick Peterse']
  gem.email       = ['yorickpeterse@olery.com']
  gem.summary     = 'A KAF parser written in Ruby.'
  gem.description = gem.summary
  # The bundled LICENSE file contains the MIT license text.
  gem.license     = 'MIT'
  gem.has_rdoc    = 'yard'

  gem.required_ruby_version = '>= 1.9.3'

  # Only regular files are packaged; directories matched by the globs are
  # filtered out.
  gem.files = Dir.glob([
    'doc/**/*',
    'lib/**/*',
    'LICENSE',
    '*.gemspec',
    'README.md'
  ]).select { |file| File.file?(file) }

  gem.add_dependency 'nokogiri'
  gem.add_dependency 'builder'

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'simplecov'
  gem.add_development_dependency 'yard'
  gem.add_development_dependency 'redcarpet', ['>= 2.0']
end
|
metadata
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: opener-kaf-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yorick Peterse <yorickpeterse@olery.com>
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: builder
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: redcarpet
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '2.0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '2.0'
|
111
|
+
description: A KAF parser written in Ruby.
|
112
|
+
email:
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- LICENSE
|
118
|
+
- README.md
|
119
|
+
- doc/css/common.css
|
120
|
+
- lib/opener/kaf_parser.rb
|
121
|
+
- lib/opener/kaf_parser/ast/base.rb
|
122
|
+
- lib/opener/kaf_parser/ast/document.rb
|
123
|
+
- lib/opener/kaf_parser/ast/opinion.rb
|
124
|
+
- lib/opener/kaf_parser/ast/text.rb
|
125
|
+
- lib/opener/kaf_parser/parser.rb
|
126
|
+
- lib/opener/kaf_parser/presenter/html.rb
|
127
|
+
- lib/opener/kaf_parser/presenter/text.rb
|
128
|
+
- lib/opener/kaf_parser/sax_parser.rb
|
129
|
+
- lib/opener/kaf_parser/version.rb
|
130
|
+
- opener-kaf-parser.gemspec
|
131
|
+
homepage:
|
132
|
+
licenses: []
|
133
|
+
metadata: {}
|
134
|
+
post_install_message:
|
135
|
+
rdoc_options: []
|
136
|
+
require_paths:
|
137
|
+
- lib
|
138
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - ">="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: 1.9.3
|
143
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - ">="
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
requirements: []
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 2.2.2
|
151
|
+
signing_key:
|
152
|
+
specification_version: 4
|
153
|
+
summary: A KAF parser written in Ruby.
|
154
|
+
test_files: []
|
155
|
+
has_rdoc: yard
|