opener-kaf-parser 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +72 -0
- data/doc/css/common.css +68 -0
- data/lib/opener/kaf_parser.rb +16 -0
- data/lib/opener/kaf_parser/ast/base.rb +76 -0
- data/lib/opener/kaf_parser/ast/document.rb +33 -0
- data/lib/opener/kaf_parser/ast/opinion.rb +52 -0
- data/lib/opener/kaf_parser/ast/text.rb +75 -0
- data/lib/opener/kaf_parser/parser.rb +25 -0
- data/lib/opener/kaf_parser/presenter/html.rb +111 -0
- data/lib/opener/kaf_parser/presenter/text.rb +69 -0
- data/lib/opener/kaf_parser/sax_parser.rb +351 -0
- data/lib/opener/kaf_parser/version.rb +5 -0
- data/opener-kaf-parser.gemspec +29 -0
- metadata +155 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: efcd70b4613807928b75e36c34e96b8ec22737e1
|
4
|
+
data.tar.gz: 7fb60e7fb63d37192efd2e2938b2a503e712358d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b920bcb157bc0d1c5ce592ea7da6c454b89b2842dc055712d662f3a3fd8cdd762ecc3f61a4cfc15cc26e11e87ee701ab8a116734e92d13996d7b5efc0fa03d7c
|
7
|
+
data.tar.gz: cb85a15b2dc234e5d9abe33222f64131cbb6491a40ea243b3178065d73f30cb050c350cc9ce064a6fa7ac2c85d4fbf27bd1f9341971e3e47a417da9ce5912753
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2013, Olery
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
[![Build Status](https://drone.io/github.com/opener-project/ruby-kaf-parser/status.png)](https://drone.io/github.com/opener-project/ruby-kaf-parser/latest)
|
2
|
+
|
3
|
+
# Ruby KAF Parser
|
4
|
+
|
5
|
+
This repository contains the source code of the opener-kaf-parser, a simple and
|
6
|
+
fast KAF parser based on Nokogiri. The KAF parser is a stack based parser that
|
7
|
+
uses the SAX parsing API of Nokogiri, thus it should (in theory) be able to
|
8
|
+
handle large KAF files without too much trouble.
|
9
|
+
|
10
|
+
## Usage
|
11
|
+
|
12
|
+
Create a parser instance and parse some KAF:
|
13
|
+
|
14
|
+
require 'opener/kaf_parser'
|
15
|
+
|
16
|
+
parser = Opener::KafParser::Parser.new
|
17
|
+
ast = parser.parse('...')
|
18
|
+
|
19
|
+
The return value is an `Opener::KafParser::AST::Document`; it and its child nodes behave like
|
20
|
+
S expressions (and are formatted that way when calling `#inspect` on them).
|
21
|
+
Currently there are 3 node types:
|
22
|
+
|
23
|
+
* document
|
24
|
+
* text
|
25
|
+
* opinion
|
26
|
+
|
27
|
+
The latter groups a set of text nodes together that make up the opinion.
|
28
|
+
|
29
|
+
To iterate over these nodes you'd do something along the lines of the
|
30
|
+
following:
|
31
|
+
|
32
|
+
ast.language # => "en"
|
33
|
+
|
34
|
+
ast.children.each do |node|
|
35
|
+
if node.type == :text
|
36
|
+
puts "Word: #{node.inspect}"
|
37
|
+
else
|
38
|
+
puts "Opinion: #{node.inspect}"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
## Presenting Text
|
43
|
+
|
44
|
+
To present an AST/text you can use one of the standard presenter classes. For
|
45
|
+
example, if you want to turn an AST into a regular Ruby String you can use the
|
46
|
+
Text presenter:
|
47
|
+
|
48
|
+
ast = parser.parse('...')
|
49
|
+
presenter = Opener::KafParser::Presenter::Text.new
|
50
|
+
|
51
|
+
puts presenter.present(ast) # => "Hello, you are doing great"
|
52
|
+
|
53
|
+
Currently the following presenters are available:
|
54
|
+
|
55
|
+
* `Opener::KafParser::Presenter::Text`
|
56
|
+
* `Opener::KafParser::Presenter::HTML`
|
57
|
+
|
58
|
+
## Requirements
|
59
|
+
|
60
|
+
* Ruby 1.9.3 or newer
|
61
|
+
* libxml2 (newer versions of Nokogiri ship libxml themselves)
|
62
|
+
|
63
|
+
## Installation:
|
64
|
+
|
65
|
+
Installing as a Gem:
|
66
|
+
|
67
|
+
gem install opener-kaf-parser
|
68
|
+
|
69
|
+
Using Bundler:
|
70
|
+
|
71
|
+
gem 'opener-kaf-parser',
|
72
|
+
:git => 'git@github.com:opener-project/ruby-kaf-parser'
|
data/doc/css/common.css
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
body
|
2
|
+
{
|
3
|
+
font-size: 14px;
|
4
|
+
line-height: 1.6;
|
5
|
+
margin: 0 auto;
|
6
|
+
max-width: 960px;
|
7
|
+
}
|
8
|
+
|
9
|
+
p code
|
10
|
+
{
|
11
|
+
background: #f2f2f2;
|
12
|
+
padding-left: 3px;
|
13
|
+
padding-right: 3px;
|
14
|
+
}
|
15
|
+
|
16
|
+
pre.code
|
17
|
+
{
|
18
|
+
font-size: 13px;
|
19
|
+
line-height: 1.4;
|
20
|
+
}
|
21
|
+
|
22
|
+
/**
|
23
|
+
* YARD uses generic table styles, using a special class means those tables
|
24
|
+
* don't get messed up.
|
25
|
+
*/
|
26
|
+
.table
|
27
|
+
{
|
28
|
+
border: 1px solid #ccc;
|
29
|
+
border-right: none;
|
30
|
+
border-collapse: separate;
|
31
|
+
border-spacing: 0;
|
32
|
+
text-align: left;
|
33
|
+
}
|
34
|
+
|
35
|
+
.table.full
|
36
|
+
{
|
37
|
+
width: 100%;
|
38
|
+
}
|
39
|
+
|
40
|
+
.table .field_name
|
41
|
+
{
|
42
|
+
min-width: 160px;
|
43
|
+
}
|
44
|
+
|
45
|
+
.table thead tr th.no_sort:first-child
|
46
|
+
{
|
47
|
+
width: 25px;
|
48
|
+
}
|
49
|
+
|
50
|
+
.table thead tr th, .table tbody tr td
|
51
|
+
{
|
52
|
+
border-bottom: 1px solid #ccc;
|
53
|
+
border-right: 1px solid #ccc;
|
54
|
+
min-width: 20px;
|
55
|
+
padding: 8px 5px;
|
56
|
+
text-align: left;
|
57
|
+
vertical-align: top;
|
58
|
+
}
|
59
|
+
|
60
|
+
.table tbody tr:last-child td
|
61
|
+
{
|
62
|
+
border-bottom: none;
|
63
|
+
}
|
64
|
+
|
65
|
+
.table tr:nth-child(odd) td
|
66
|
+
{
|
67
|
+
background: #f9f9f9;
|
68
|
+
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'time'
|
3
|
+
require 'builder'
|
4
|
+
|
5
|
+
require_relative 'kaf_parser/version'
|
6
|
+
|
7
|
+
require_relative 'kaf_parser/ast/base'
|
8
|
+
require_relative 'kaf_parser/ast/document'
|
9
|
+
require_relative 'kaf_parser/ast/text'
|
10
|
+
require_relative 'kaf_parser/ast/opinion'
|
11
|
+
|
12
|
+
require_relative 'kaf_parser/sax_parser'
|
13
|
+
require_relative 'kaf_parser/parser'
|
14
|
+
|
15
|
+
require_relative 'kaf_parser/presenter/text'
|
16
|
+
require_relative 'kaf_parser/presenter/html'
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Common foundation of every AST node. A node carries a type, an
      # optional value and a list of child nodes; subclasses add their own
      # attributes on top of that.
      #
      # @!attribute [rw] type
      #  @return [Symbol]
      #
      # @!attribute [rw] value
      #  @return [String]
      #
      # @!attribute [rw] children
      #  @return [Array<Opener::KafParser::AST::Base>]
      #
      class Base
        attr_accessor :type, :value, :children

        ##
        # Stores every attribute for which the node exposes a public method of
        # the same name; other keys are ignored. When the node defines an
        # `after_initialize` method it is invoked once the defaults are set.
        #
        # @param [Hash] attributes
        #
        def initialize(attributes = {})
          attributes.each do |name, data|
            next unless respond_to?(name)

            instance_variable_set("@#{name}", data)
          end

          @children ||= []
          @type     ||= :generic

          after_initialize if respond_to?(:after_initialize)
        end

        ##
        # Formats the node (and, recursively, its children) as an S
        # expression.
        #
        # @param [Numeric] indent Amount of leading spaces for this node.
        # @return [String]
        #
        def inspect(indent = 0)
          parts = [(' ' * indent) + "(#{type}"]
          parts << value.inspect if value

          nested = children.map { |child| child.inspect(indent + 2) }
          parts << ("\n" + nested.join("\n")) unless nested.empty?

          "#{parts.join(' ')})"
        end

        ##
        # The base node has no extra attributes; subclasses override this.
        #
        # @return [Hash]
        #
        def attributes
          {}
        end

        ##
        # @return [TrueClass|FalseClass]
        #
        def text?
          :text == type
        end

        ##
        # @return [TrueClass|FalseClass]
        #
        def opinion?
          :opinion == type
        end
      end # Base
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Root node of the tree: it describes a `<KAF>` tag and holds every
      # node found inside of it as a child.
      #
      # @!attribute [rw] language
      #  @return [String]
      #
      # @!attribute [rw] version
      #  @return [String]
      #
      class Document < Base
        attr_accessor :language, :version

        ##
        # Hook run by the base constructor; marks the node as a document.
        #
        def after_initialize
          @type = :document
        end

        ##
        # @return [Hash] The language and version, keyed by attribute name.
        #
        def attributes
          {
            :language => language,
            :version  => version
          }
        end
      end # Document
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Node describing an opinion: who holds it, what it targets, and its
      # polarity and strength. The nodes forming the opinion expression are
      # available through `children`.
      #
      # @!attribute [rw] id
      #  @return [String]
      #
      # @!attribute [rw] holder The nodes that make up the opinion holder.
      #  @return [Array]
      #
      # @!attribute [rw] target The nodes that make up the opinion target.
      #  @return [Array]
      #
      # @!attribute [rw] polarity
      #  @return [String]
      #
      # @!attribute [rw] strength
      #  @return [Numeric]
      #
      class Opinion < Base
        attr_accessor :id, :holder, :target, :polarity, :strength

        ##
        # Hook run by the base constructor; sets the node type and makes sure
        # the holder and target lists exist.
        #
        def after_initialize
          @type     = :opinion
          @holder ||= []
          @target ||= []
        end

        ##
        # @return [Hash] The opinion specific attributes keyed by name.
        #
        def attributes
          {
            :id       => id,
            :holder   => holder,
            :target   => target,
            :polarity => polarity,
            :strength => strength
          }
        end
      end # Opinion
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Opener
  module KafParser
    module AST
      ##
      # Node class that contains information about a set of characters such as
      # the polarity and POS.
      #
      # @!attribute [rw] id
      #  @return [String]
      #
      # @!attribute [rw] sentence
      #  @return [Numeric]
      #
      # @!attribute [rw] paragraph
      #  @return [Numeric]
      #
      # @!attribute [rw] offset
      #  @return [Numeric]
      #
      # @!attribute [rw] length
      #  @return [Numeric]
      #
      # @!attribute [rw] word_type
      #  @return [String]
      #
      # @!attribute [rw] pos
      #  @return [String]
      #
      # @!attribute [rw] morphofeat
      #  @return [String]
      #
      # @!attribute [rw] sentiment_modifier
      #  @return [String]
      #
      # @!attribute [rw] polarity
      #  @return [String]
      #
      # @!attribute [rw] property
      #  @return [String]
      #
      class Text < Base
        attr_accessor :id, :sentence, :paragraph, :offset, :length, :word_type,
          :pos, :morphofeat, :sentiment_modifier, :polarity, :property

        ##
        # Called after a new instance of this class is created. Sets the node
        # type and, when no explicit length was given, derives it from the
        # value.
        #
        def after_initialize
          @type = :text

          # The value may be assigned after construction (or not at all), so
          # only derive the length when there is a value to measure.
          @length ||= value.length if value
        end

        ##
        # @return [Hash] The text specific attributes keyed by name.
        #
        def attributes
          return {
            :id                 => id,
            :sentence           => sentence,
            :paragraph          => paragraph,
            :offset             => offset,
            :length             => length,
            :word_type          => word_type,
            :pos                => pos,
            :morphofeat         => morphofeat,
            :sentiment_modifier => sentiment_modifier,
            :polarity           => polarity,
            :property           => property
          }
        end
      end # Text
    end # AST
  end # KafParser
end # Opener
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Opener
  module KafParser
    ##
    # Convenience wrapper that hides the wiring between the Nokogiri SAX
    # parser and the KAF specific SAX handler.
    #
    class Parser
      ##
      # Feeds the KAF/XML input through a fresh SAX handler and hands back the
      # document that handler produced.
      #
      # @param [String] input The XML/KAF to parse.
      # @return [Opener::KafParser::AST::Document]
      #
      def parse(input)
        handler = SaxParser.new

        Nokogiri::XML::SAX::Parser.new(handler).parse(input)

        handler.document
      end
    end # Parser
  end # KafParser
end # Opener
|
@@ -0,0 +1,111 @@
|
|
1
|
+
module Opener
  module KafParser
    module Presenter
      ##
      # The HTML presenter takes an AST and turns it into a block of HTML where
      # each word is wrapped in a tag and has various meta information (e.g.
      # the polarity) assigned to it.
      #
      # Basic usage:
      #
      #     parser    = Opener::KafParser::Parser.new
      #     ast       = parser.parse('...')
      #     presenter = Opener::KafParser::Presenter::HTML.new
      #
      #     puts presenter.present(ast)
      #
      # ## Output
      #
      # The output is a set of span tags for each set of characters, span tags
      # for whitespace and a set of span tags that group opinion expressions.
      # Each span tag has a class indicating the type ("text", "opinion", etc)
      # and a set of `data-*` attributes containing data such as the polarity.
      # For example, the ID of a text node would be stored in `data-id`, the
      # polarity in `data-polarity` and so forth.
      #
      class HTML < Text
        ##
        # @return [String]
        #
        SPACE = ' '.freeze

        ##
        # Classes whose instances may be stored in a `data-*` attribute.
        #
        # @return [Array]
        #
        TYPES_WHITELIST = [String, Numeric].freeze

        ##
        # Presents the AST as a collection of HTML tags.
        #
        # @param [Opener::KafParser::AST::Base] ast
        # @return [String]
        #
        def present(ast)
          builder = Builder::XmlMarkup.new

          render_ast(ast, 0, builder)

          return builder.target!
        end

        private

        ##
        # Renders all children of the given node. Text nodes become spans,
        # other nodes become a wrapping span containing their own children.
        #
        # @param [Opener::KafParser::AST::Base] ast
        # @param [Numeric] offset The offset reached so far.
        # @param [Builder::XmlMarkup] builder
        # @return [Numeric] The offset after the last rendered text node.
        #
        def render_ast(ast, offset, builder)
          ast.children.each do |node|
            if node.text?
              offset = render_node(node, offset, builder)
            else
              render_span(node, builder) do
                # Carry the offset reached inside the nested node back out,
                # otherwise the next sibling is padded from a stale offset.
                offset = render_ast(node, offset, builder)
              end
            end
          end

          return offset
        end

        ##
        # Renders a single text node, preceded by a whitespace span when there
        # is a gap between the current offset and the offset of the node.
        #
        # @see #render_ast
        # @return [Numeric] The offset directly after the node.
        #
        def render_node(node, offset, builder)
          diff = node.offset - offset

          if diff > 0
            builder.span(:class => 'whitespace') do |sub_builder|
              sub_builder << SPACE * diff
            end
          end

          render_span(node, builder)

          return calculate_offset(node)
        end

        ##
        # Emits a span for the node. When a block is given it is called with
        # the builder so nested content can be rendered inside the span.
        #
        # @param [Opener::KafParser::AST::Base] node
        # @param [Builder::XmlMarkup] builder
        #
        def render_span(node, builder)
          attrs = {'class' => node.type}

          # Only store simple values in the HTML attributes. `is_a?` is used
          # instead of comparing classes so that Integer/Float values match
          # the Numeric entry of the whitelist.
          node.attributes.each do |key, value|
            next unless TYPES_WHITELIST.any? { |type| value.is_a?(type) }

            attrs["data-#{key}"] = value
          end

          if block_given?
            builder.span(node.value, attrs) { |sub_builder| yield sub_builder }
          else
            builder.span(node.value, attrs)
          end
        end
      end # HTML
    end # Presenter
  end # KafParser
end # Opener
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Opener
  module KafParser
    module Presenter
      ##
      # The Text presenter class takes an AST and builds a plain Ruby string
      # containing the correct whitespace between various nodes.
      #
      class Text
        ##
        # Presents the AST as a plain Ruby String with no special formatting.
        #
        # @param [Opener::KafParser::AST::Base] ast
        # @return [String]
        #
        def present(ast)
          buffer = ''

          render_ast(ast, 0, buffer)

          return buffer
        end

        private

        ##
        # Appends the text of all (nested) children of the given node to the
        # buffer.
        #
        # @param [Opener::KafParser::AST::Base] ast
        # @param [Numeric] offset The offset reached so far.
        # @param [String] buffer
        # @return [Numeric] The offset after the last rendered text node.
        #
        def render_ast(ast, offset, buffer)
          ast.children.each do |node|
            # Both branches hand back the new offset. Dropping the offset of
            # a nested node would pad the next sibling from a stale position.
            offset = if node.text?
              render_node(node, offset, buffer)
            else
              render_ast(node, offset, buffer)
            end
          end

          return offset
        end

        ##
        # Appends a single text node, preceded by as many spaces as there are
        # characters between the current offset and the offset of the node.
        #
        # @param [Opener::KafParser::AST::Text] node
        # @param [Numeric] offset
        # @param [String] buffer
        # @return [Numeric]
        #
        def render_node(node, offset, buffer)
          diff = node.offset - offset

          buffer << ' ' * diff if diff > 0
          buffer << node.value

          return calculate_offset(node)
        end

        ##
        # @param [Opener::KafParser::AST::Text] node
        # @return [Numeric] The offset directly after the given node.
        #
        def calculate_offset(node)
          return node.offset + node.length
        end
      end # Text
    end # Presenter
  end # KafParser
end # Opener
|
@@ -0,0 +1,351 @@
|
|
1
|
+
module Opener
  module KafParser
    ##
    # The SaxParser class is a Nokogiri SAX parser that builds a list of
    # {Opener::KafParser::AST::Base} nodes containing word information such as
    # the polarity and Part Of Speech as well as grouping words together based
    # on the opinion expression they belong to.
    #
    # This SAX parser is a stack based parser and parses only relevant
    # information of KAF documents. Elements without a matching `on_*` or
    # `after_*` callback are ignored.
    #
    # @!attribute [r] document
    #  @return [Opener::KafParser::AST::Document]
    #
    class SaxParser < Nokogiri::XML::SAX::Document
      attr_reader :document

      ##
      # @see Nokogiri::XML::SAX::Document#initialize
      #
      def initialize(*args)
        super

        @stack             = []
        @attributes        = []
        @document          = nil
        @characters        = ''
        @targets           = []
        @buffer_characters = false
        @buffer_targets    = false
        @word_mapping      = {}
        @term_mapping      = {}
      end

      ##
      # Called at the start of an XML element. This method delegates the work
      # to individual method calls based on the node name.
      #
      # @param [String] name The name of the element.
      # @param [Array] attributes Name/value pairs of the element.
      #
      def start_element(name, attributes = [])
        callback   = 'on_' + callback_name(name)
        attributes = associate_attributes(attributes)

        execute_callback(callback, attributes)
      end

      ##
      # Called at the end of an XML element, delegates to `after_*` methods.
      #
      # @param [String] name The name of the element.
      #
      def end_element(name)
        callback = 'after_' + callback_name(name)

        execute_callback(callback)
      end

      ##
      # Processes the characters of an XML node. Characters are only kept
      # while character buffering is enabled.
      #
      # @param [String] text
      #
      def characters(text)
        @characters << text if @buffer_characters
      end

      ##
      # Processes a `<KAF>` node.
      #
      # @param [Hash] attr
      #
      def on_kaf(attr)
        @stack << AST::Document.new(
          :language => attr.fetch('xml:lang', 'en'),
          :version  => attr['version']
        )
      end

      ##
      # @see #on_kaf
      #
      def after_kaf
        @document = @stack.pop
      end

      ##
      # Processes a `<wf>` node.
      #
      # @param [Hash] attr
      #
      def on_wf(attr)
        @stack << AST::Text.new(
          :id        => attr['wid'],
          :sentence  => attr['sent'].to_i,
          :offset    => attr['offset'].to_i,
          :length    => attr['length'].to_i,
          :paragraph => attr['para'].to_i
        )

        @buffer_characters = true
      end

      ##
      # @see #on_wf
      #
      def after_wf
        wf       = @stack.pop
        wf.value = @characters

        current_object.children << wf

        @word_mapping[wf.id] = wf

        reset_character_buffer
      end

      ##
      # Processes a `<term>` node.
      #
      # @param [Hash] attr
      #
      def on_term(attr)
        @attributes << attr

        @buffer_targets = true
      end

      ##
      # Copies the term (and optional sentiment) attributes onto every word
      # the term refers to and registers the word under the term ID.
      #
      # @see #on_term
      #
      def after_term
        attrs, sentiment = @attributes

        @targets.each do |target|
          word = @word_mapping[target]

          # Skip references to word forms that were never defined instead of
          # failing on nil.
          next unless word

          word.morphofeat = attrs['morphofeat']
          word.word_type  = attrs['type']
          word.pos        = attrs['pos']

          if sentiment
            word.sentiment_modifier = sentiment['sentiment_modifier']
            word.polarity           = sentiment['polarity']
          end

          # Map the term IDs to the word form node.
          @term_mapping[attrs['tid']] = word
        end

        reset_target_buffer
        reset_attributes_buffer
      end

      ##
      # Processes a `<target>` node.
      #
      # @param [Hash] attr
      #
      def on_target(attr)
        @targets << attr['id'] if @buffer_targets
      end

      ##
      # Processes a `<sentiment>` node.
      #
      # @param [Hash] attr
      #
      def on_sentiment(attr)
        @attributes << attr
      end

      ##
      # Processes a `<opinion>` node.
      #
      # @param [Hash] attr
      #
      def on_opinion(attr)
        @stack << AST::Opinion.new(:id => attr['oid'])
      end

      ##
      # Moves the opinion node into its parent, at the position of the first
      # word of the expression, and removes the expression words from the
      # parent since they now live inside the opinion node.
      #
      # @see #on_opinion
      #
      def after_opinion
        opinion = @stack.pop
        parent  = current_object

        moved = opinion.children.each_with_object({}) do |node, hash|
          hash[node.id] = true
        end

        # Insert the opinion node before the first node of the expression.
        # When that node can't be found (e.g. an empty expression) the opinion
        # is appended instead of raising on a nil index.
        first_index = parent.children.index(opinion.children.first)

        if first_index
          parent.children.insert(first_index, opinion)
        else
          parent.children << opinion
        end

        # `delete_if` is used because deleting from the Array while iterating
        # over it with `each` skips the element following every removed one.
        parent.children.delete_if do |node|
          node.is_a?(AST::Text) && moved.key?(node.id)
        end
      end

      ##
      # @param [Hash] attr
      #
      def on_opinion_holder(attr)
        @buffer_targets = true
      end

      ##
      # @see #on_opinion_holder
      #
      def after_opinion_holder
        current_object.holder.concat(referenced_terms)

        reset_target_buffer
      end

      ##
      # @param [Hash] attr
      #
      def on_opinion_target(attr)
        @buffer_targets = true
      end

      ##
      # @see #on_opinion_target
      #
      def after_opinion_target
        current_object.target.concat(referenced_terms)

        reset_target_buffer
      end

      ##
      # Processes an `<opinion_expression>` node.
      #
      # @param [Hash] attr
      #
      def on_opinion_expression(attr)
        current_object.polarity = attr['polarity']
        current_object.strength = attr['strength'].to_i

        @buffer_targets = true
      end

      ##
      # @see #on_opinion_expression
      #
      def after_opinion_expression
        current_object.children.concat(referenced_terms)

        reset_target_buffer
      end

      ##
      # Processes a `<property>` node.
      #
      # @param [Hash] attr
      #
      def on_property(attr)
        @attributes << attr

        @buffer_targets = true
      end

      ##
      # @see #on_property
      #
      def after_property
        attrs = @attributes.pop

        referenced_terms.each do |term|
          term.property = attrs['lemma']
        end

        reset_attributes_buffer
        reset_target_buffer
      end

      private

      ##
      # Returns a callback name for the given XML node name: camelCase names
      # are converted to snake_case and the result is lowercased.
      #
      # @param [String] name
      # @return [String]
      #
      def callback_name(name)
        return name.gsub(/([^A-Z]+)([A-Z]+)/, '\\1_\\2').downcase
      end

      ##
      # Calls the given method if this object publicly responds to it.
      #
      # @param [String] name
      # @param [Array] args
      #
      def execute_callback(name, *args)
        send(name, *args) if respond_to?(name)
      end

      ##
      # Converts an Array of attributes into a Hash.
      #
      # @param [Array] attributes
      # @return [Hash]
      #
      def associate_attributes(attributes)
        return attributes.each_with_object({}) do |pair, hash|
          hash[pair[0]] = pair[1]
        end
      end

      ##
      # @return [Mixed] The node on top of the stack.
      #
      def current_object
        return @stack.last
      end

      ##
      # Returns the nodes registered for the buffered target IDs, leaving out
      # IDs for which no term was registered.
      #
      # @return [Array]
      #
      def referenced_terms
        return @targets.map { |target| @term_mapping[target] }.compact
      end

      ##
      # Resets the character buffer and disables buffering.
      #
      def reset_character_buffer
        @buffer_characters = false
        @characters        = ''
      end

      ##
      # Resets the target buffer and disables buffering.
      #
      def reset_target_buffer
        @buffer_targets = false
        @targets        = []
      end

      ##
      # Resets the attributes buffer.
      #
      def reset_attributes_buffer
        @attributes = []
      end
    end # SaxParser
  end # KafParser
end # Opener
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path('../lib/opener/kaf_parser/version', __FILE__)

# Gem specification for opener-kaf-parser.
#
# The author's e-mail address lives in `email` rather than inside `authors`,
# the license shipped in the LICENSE file is declared, and the deprecated
# `has_rdoc` setting is no longer assigned.
Gem::Specification.new do |gem|
  gem.name        = 'opener-kaf-parser'
  gem.version     = Opener::KafParser::VERSION
  gem.authors     = ['Yorick Peterse']
  gem.email       = ['yorickpeterse@olery.com']
  gem.summary     = 'A KAF parser written in Ruby.'
  gem.description = gem.summary
  gem.license     = 'MIT'

  gem.required_ruby_version = '>= 1.9.3'

  # Only package regular files; directories matched by the globs are dropped.
  gem.files = Dir.glob([
    'doc/**/*',
    'lib/**/*',
    'LICENSE',
    '*.gemspec',
    'README.md'
  ]).select { |file| File.file?(file) }

  gem.add_dependency 'nokogiri'
  gem.add_dependency 'builder'

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'simplecov'
  gem.add_development_dependency 'yard'
  gem.add_development_dependency 'redcarpet', ['>= 2.0']
end
|
metadata
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: opener-kaf-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yorick Peterse <yorickpeterse@olery.com>
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: builder
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: redcarpet
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '2.0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '2.0'
|
111
|
+
description: A KAF parser written in Ruby.
|
112
|
+
email:
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- LICENSE
|
118
|
+
- README.md
|
119
|
+
- doc/css/common.css
|
120
|
+
- lib/opener/kaf_parser.rb
|
121
|
+
- lib/opener/kaf_parser/ast/base.rb
|
122
|
+
- lib/opener/kaf_parser/ast/document.rb
|
123
|
+
- lib/opener/kaf_parser/ast/opinion.rb
|
124
|
+
- lib/opener/kaf_parser/ast/text.rb
|
125
|
+
- lib/opener/kaf_parser/parser.rb
|
126
|
+
- lib/opener/kaf_parser/presenter/html.rb
|
127
|
+
- lib/opener/kaf_parser/presenter/text.rb
|
128
|
+
- lib/opener/kaf_parser/sax_parser.rb
|
129
|
+
- lib/opener/kaf_parser/version.rb
|
130
|
+
- opener-kaf-parser.gemspec
|
131
|
+
homepage:
|
132
|
+
licenses: []
|
133
|
+
metadata: {}
|
134
|
+
post_install_message:
|
135
|
+
rdoc_options: []
|
136
|
+
require_paths:
|
137
|
+
- lib
|
138
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - ">="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: 1.9.3
|
143
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - ">="
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
requirements: []
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 2.2.2
|
151
|
+
signing_key:
|
152
|
+
specification_version: 4
|
153
|
+
summary: A KAF parser written in Ruby.
|
154
|
+
test_files: []
|
155
|
+
has_rdoc: yard
|