opener-kaf-naf-parser 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +67 -8
- data/bin/kaf-naf-parser-daemon +10 -0
- data/core/kaf-naf-parser.py +5 -5
- data/exec/kaf-naf-parser.rb +9 -0
- data/ext/hack/Rakefile +13 -0
- data/lib/opener/kaf_naf_parser/version.rb +1 -1
- data/opener-kaf-naf-parser.gemspec +5 -1
- data/pre_install_requirements.txt +3 -0
- metadata +37 -51
- data/core/packages/KafNafParser-1.2.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.3.tar.gz +0 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +0 -338
- data/core/site-packages/pre_build/KafNafParser/__init__.py +0 -14
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +0 -125
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +0 -52
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +0 -80
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +0 -59
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +0 -41
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +0 -2
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +0 -205
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +0 -300
- data/core/site-packages/pre_build/KafNafParser/features_data.py +0 -71
- data/core/site-packages/pre_build/KafNafParser/header_data.py +0 -127
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +0 -200
- data/core/site-packages/pre_build/KafNafParser/references_data.py +0 -15
- data/core/site-packages/pre_build/KafNafParser/span_data.py +0 -63
- data/core/site-packages/pre_build/KafNafParser/term_data.py +0 -111
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +0 -42
- data/core/site-packages/pre_build/KafNafParser/text_data.py +0 -90
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/SOURCES.txt +0 -22
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/installed-files.txt +0 -47
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +0 -28
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +0 -156
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +0 -121
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +0 -72
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/SOURCES.txt +0 -14
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/installed-files.txt +0 -23
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +0 -165
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +0 -439
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +0 -7
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +0 -7
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +0 -11
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +0 -1
- data/pre_build_requirements.txt +0 -3
@@ -1,15 +0,0 @@
|
|
1
|
-
# Modified for NAF/KAf
|
2
|
-
from span_data import *
|
3
|
-
|
4
|
-
class Creferences:
|
5
|
-
def __init__(self,node=None):
|
6
|
-
self.type = 'NAF/KAF'
|
7
|
-
if node is None:
|
8
|
-
self.node = etree.Element('references')
|
9
|
-
else:
|
10
|
-
self.node = node
|
11
|
-
|
12
|
-
def __iter__(self):
|
13
|
-
for span_node in self.node.findall('span'):
|
14
|
-
yield Cspan(span_node)
|
15
|
-
|
@@ -1,63 +0,0 @@
|
|
1
|
-
# Modified for KAF/NAF
|
2
|
-
|
3
|
-
from lxml import etree
|
4
|
-
from lxml.objectify import dump
|
5
|
-
|
6
|
-
class Ctarget:
|
7
|
-
def __init__(self,node=None):
|
8
|
-
self.type = 'NAF/KAF'
|
9
|
-
if node is None:
|
10
|
-
self.node = etree.Element('target')
|
11
|
-
else:
|
12
|
-
self.node = node
|
13
|
-
|
14
|
-
def get_id(self):
|
15
|
-
return self.node.get('id')
|
16
|
-
|
17
|
-
def set_id(self,this_id):
|
18
|
-
self.node.set('id',this_id)
|
19
|
-
|
20
|
-
def get_node(self):
|
21
|
-
return self.node
|
22
|
-
|
23
|
-
|
24
|
-
class Cspan:
|
25
|
-
def __init__(self,node=None):
|
26
|
-
self.type = 'NAF/KAF'
|
27
|
-
if node is None:
|
28
|
-
self.node = etree.Element('span')
|
29
|
-
else:
|
30
|
-
self.node = node
|
31
|
-
|
32
|
-
def add_target_id(self,this_id):
|
33
|
-
new_target = Ctarget()
|
34
|
-
new_target.set_id(this_id)
|
35
|
-
self.node.append(new_target.get_node())
|
36
|
-
|
37
|
-
def create_from_ids(self,list_ids):
|
38
|
-
for this_id in list_ids:
|
39
|
-
new_target = Ctarget()
|
40
|
-
new_target.set_id(this_id)
|
41
|
-
self.node.append(new_target.get_node())
|
42
|
-
|
43
|
-
def add_target(self,target):
|
44
|
-
self.node.append(target.get_node())
|
45
|
-
|
46
|
-
|
47
|
-
def __get_target_nodes(self):
|
48
|
-
for target_node in self.node.findall('target'):
|
49
|
-
yield target_node
|
50
|
-
|
51
|
-
def __iter__(self):
|
52
|
-
for target_node in self.__get_target_nodes():
|
53
|
-
yield Ctarget(target_node)
|
54
|
-
|
55
|
-
def get_span_ids(self):
|
56
|
-
return [t_obj.get_id() for t_obj in self]
|
57
|
-
|
58
|
-
def __str__(self):
|
59
|
-
return dump(self.node)
|
60
|
-
|
61
|
-
def get_node(self):
|
62
|
-
return self.node
|
63
|
-
|
@@ -1,111 +0,0 @@
|
|
1
|
-
# included code for NAF/KAF
|
2
|
-
|
3
|
-
|
4
|
-
from span_data import *
|
5
|
-
from external_references_data import *
|
6
|
-
from term_sentiment_data import *
|
7
|
-
from lxml import etree
|
8
|
-
|
9
|
-
|
10
|
-
class Cterm:
|
11
|
-
def __init__(self,node=None,type='NAF'):
|
12
|
-
self.type = type
|
13
|
-
if node is None:
|
14
|
-
self.node = etree.Element('term')
|
15
|
-
else:
|
16
|
-
self.node = node
|
17
|
-
|
18
|
-
|
19
|
-
def get_id(self):
|
20
|
-
if self.type == 'NAF':
|
21
|
-
return self.node.get('id')
|
22
|
-
elif self.type == 'KAF':
|
23
|
-
return self.node.get('tid')
|
24
|
-
|
25
|
-
def get_lemma(self):
|
26
|
-
return self.node.get('lemma')
|
27
|
-
|
28
|
-
def get_pos(self):
|
29
|
-
return self.node.get('pos')
|
30
|
-
|
31
|
-
def get_morphofeat(self):
|
32
|
-
return self.node.get('morphofeat')
|
33
|
-
|
34
|
-
def get_span(self):
|
35
|
-
node_span = self.node.find('span')
|
36
|
-
if node_span is not None:
|
37
|
-
return Cspan(node_span)
|
38
|
-
else:
|
39
|
-
return None
|
40
|
-
|
41
|
-
def get_sentiment(self):
|
42
|
-
sent_node = self.node.find('sentiment')
|
43
|
-
|
44
|
-
if sent_node is None:
|
45
|
-
return None
|
46
|
-
else:
|
47
|
-
return Cterm_sentiment(sent_node)
|
48
|
-
|
49
|
-
|
50
|
-
def add_external_reference(self,ext_ref):
|
51
|
-
ext_refs_node = self.node.find('externalReferences')
|
52
|
-
if ext_refs_node is None:
|
53
|
-
ext_refs_obj = CexternalReferences()
|
54
|
-
self.node.append(ext_refs_obj.get_node())
|
55
|
-
else:
|
56
|
-
ext_refs_obj = CexternalReferences(ext_refs_node)
|
57
|
-
|
58
|
-
ext_refs_obj.add_external_reference(ext_ref)
|
59
|
-
|
60
|
-
def add_term_sentiment(self,term_sentiment):
|
61
|
-
self.node.append(term_sentiment.get_node())
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
class Cterms:
|
67
|
-
def __init__(self,node=None,type='NAF'):
|
68
|
-
self.idx = {}
|
69
|
-
self.type = type
|
70
|
-
if node is None:
|
71
|
-
self.node = etree.Element('terms')
|
72
|
-
else:
|
73
|
-
self.node = node
|
74
|
-
for node_term in self.__get_node_terms():
|
75
|
-
self.idx[node_term.get('id')] = node_term
|
76
|
-
|
77
|
-
def get_node(self):
|
78
|
-
return self.node
|
79
|
-
|
80
|
-
def to_kaf(self):
|
81
|
-
if self.type == 'NAF':
|
82
|
-
self.type = 'KAF'
|
83
|
-
for node in self.__get_node_terms():
|
84
|
-
node.set('tid',node.get('id'))
|
85
|
-
del node.attrib['id']
|
86
|
-
|
87
|
-
def to_naf(self):
|
88
|
-
if self.type == 'KAF':
|
89
|
-
self.type = 'NAF'
|
90
|
-
for node in self.__get_node_terms():
|
91
|
-
node.set('id',node.get('tid'))
|
92
|
-
del node.attrib['tid']
|
93
|
-
|
94
|
-
def __get_node_terms(self):
|
95
|
-
for node_term in self.node.findall('term'):
|
96
|
-
yield node_term
|
97
|
-
|
98
|
-
def __iter__(self):
|
99
|
-
for node_term in self.__get_node_terms():
|
100
|
-
yield Cterm(node_term,self.type)
|
101
|
-
|
102
|
-
def get_term(self,term_id):
|
103
|
-
if term_id in self.idx:
|
104
|
-
return Cterm(self.idx[term_id],self.type)
|
105
|
-
else:
|
106
|
-
return None
|
107
|
-
|
108
|
-
def add_external_reference(self,term_id, external_ref):
|
109
|
-
if term_id in self.idx:
|
110
|
-
term_obj = Cterm(self.idx[term_id],self.type)
|
111
|
-
term_obj.add_external_reference(external_ref)
|
@@ -1,42 +0,0 @@
|
|
1
|
-
# Modified for NAF KAF
|
2
|
-
from lxml import etree
|
3
|
-
from lxml.objectify import dump
|
4
|
-
|
5
|
-
class Cterm_sentiment:
|
6
|
-
def __init__(self,node=None):
|
7
|
-
self.type = 'NAF/KAF'
|
8
|
-
if node is None:
|
9
|
-
self.node = etree.Element('sentiment')
|
10
|
-
else:
|
11
|
-
self.node = node
|
12
|
-
#self.resource = self.polarity = self.strength = self.subjectivity = self.semantic_type = self.modifier = self.marker = self.product_feature = ''
|
13
|
-
#if node is not None:
|
14
|
-
# self.resource = node.get('resource','')
|
15
|
-
# self.polarity = node.get('polarity','')
|
16
|
-
# self.strength = node.get('strength','')
|
17
|
-
# self.subjectivity = node.get('subjectivity','')
|
18
|
-
#self.semantic_type = node.get('sentiment_semantic_type','')
|
19
|
-
# self.modifier = node.get('sentiment modifier','')
|
20
|
-
# self.marker = node.get('sentiment_marker','')
|
21
|
-
# self.product_feature = node.get('sentiment product feature','')
|
22
|
-
|
23
|
-
def set_resource(self,r):
|
24
|
-
self.node.set('resource',r)
|
25
|
-
|
26
|
-
def get_node(self):
|
27
|
-
return self.node
|
28
|
-
|
29
|
-
def get_polarity(self):
|
30
|
-
return self.node.get('polarity')
|
31
|
-
|
32
|
-
def set_polarity(self,p):
|
33
|
-
self.node.set('polarity',p)
|
34
|
-
|
35
|
-
def get_modifier(self):
|
36
|
-
return self.node.get('sentiment_modifier')
|
37
|
-
|
38
|
-
def set_modifier(self,sm):
|
39
|
-
self.node.set('sentiment_modifier',sm)
|
40
|
-
|
41
|
-
def __str__(self):
|
42
|
-
return dump(self.node)
|
@@ -1,90 +0,0 @@
|
|
1
|
-
# included code for NAF/KAF
|
2
|
-
|
3
|
-
from lxml import etree
|
4
|
-
|
5
|
-
|
6
|
-
class Cwf:
|
7
|
-
def __init__(self,node=None,type='NAF'):
|
8
|
-
self.type = type
|
9
|
-
##self.id = '' self.sent = '' self.para = '' self.page = '' self.offset = '' self.lenght = '' s
|
10
|
-
if node is None:
|
11
|
-
self.node = etree.Element('wf')
|
12
|
-
else:
|
13
|
-
self.node = node
|
14
|
-
|
15
|
-
def get_node(self):
|
16
|
-
return self.node
|
17
|
-
|
18
|
-
def set_id(self,this_id):
|
19
|
-
if self.type == 'NAF':
|
20
|
-
return self.node.set('id',this_id)
|
21
|
-
elif self.type == 'KAF':
|
22
|
-
return self.node.set('wid',this_id)
|
23
|
-
|
24
|
-
def get_id(self):
|
25
|
-
if self.type == 'NAF':
|
26
|
-
return self.node.get('id')
|
27
|
-
elif self.type == 'KAF':
|
28
|
-
return self.node.get('wid')
|
29
|
-
|
30
|
-
def set_text(self,this_text):
|
31
|
-
self.node.text = this_text
|
32
|
-
|
33
|
-
def get_text(self):
|
34
|
-
return self.node.text
|
35
|
-
|
36
|
-
def set_sent(self,this_sent):
|
37
|
-
self.node.set('sent',this_sent)
|
38
|
-
|
39
|
-
def get_sent(self):
|
40
|
-
return self.node.get('sent')
|
41
|
-
|
42
|
-
|
43
|
-
class Ctext:
|
44
|
-
def __init__(self,node=None,type='NAF'):
|
45
|
-
self.idx = {}
|
46
|
-
self.type = type
|
47
|
-
if node is None:
|
48
|
-
self.node = etree.Element('text')
|
49
|
-
else:
|
50
|
-
self.node = node
|
51
|
-
for wf_node in self.__get_wf_nodes():
|
52
|
-
if self.type == 'NAF': label_id = 'id'
|
53
|
-
elif self.type == 'KAF': label_id = 'wid'
|
54
|
-
self.idx[wf_node.get(label_id)] = wf_node
|
55
|
-
|
56
|
-
def get_node(self):
|
57
|
-
return self.node
|
58
|
-
|
59
|
-
def to_kaf(self):
|
60
|
-
if self.type == 'NAF':
|
61
|
-
self.type = 'KAF'
|
62
|
-
for node in self.__get_wf_nodes():
|
63
|
-
node.set('wid',node.get('id'))
|
64
|
-
del node.attrib['id']
|
65
|
-
|
66
|
-
def to_naf(self):
|
67
|
-
if self.type == 'KAF':
|
68
|
-
self.type = 'NAF'
|
69
|
-
for node in self.__get_wf_nodes():
|
70
|
-
node.set('id',node.get('wid'))
|
71
|
-
del node.attrib['wid']
|
72
|
-
|
73
|
-
def __get_wf_nodes(self):
|
74
|
-
for wf_node in self.node.findall('wf'):
|
75
|
-
yield wf_node
|
76
|
-
|
77
|
-
def __iter__(self):
|
78
|
-
for wf_node in self.__get_wf_nodes():
|
79
|
-
yield Cwf(node=wf_node,type=self.type)
|
80
|
-
|
81
|
-
def get_wf(self,token_id):
|
82
|
-
wf_node = self.idx.get(token_id)
|
83
|
-
if wf_node is not None:
|
84
|
-
return Cwf(node=wf_node,type=self.type)
|
85
|
-
else:
|
86
|
-
return None
|
87
|
-
|
88
|
-
def add_wf(self,wf_obj):
|
89
|
-
self.node.append(wf_obj.get_node())
|
90
|
-
|
@@ -1,10 +0,0 @@
|
|
1
|
-
Metadata-Version: 1.0
|
2
|
-
Name: KafNafParser
|
3
|
-
Version: 1.2
|
4
|
-
Summary: Parser between KAF and NAF
|
5
|
-
Home-page: https://github.com/cltl/KafNafParserPy
|
6
|
-
Author: Ruben Izquierdo
|
7
|
-
Author-email: r.izquierdobevia@vu.nl
|
8
|
-
License: UNKNOWN
|
9
|
-
Description: UNKNOWN
|
10
|
-
Platform: UNKNOWN
|
@@ -1,22 +0,0 @@
|
|
1
|
-
KafNafParser/KafNafParserMod.py
|
2
|
-
KafNafParser/__init__.py
|
3
|
-
KafNafParser/constituency_data.py
|
4
|
-
KafNafParser/coreference_data.py
|
5
|
-
KafNafParser/dependency_data.py
|
6
|
-
KafNafParser/entity_data.py
|
7
|
-
KafNafParser/external_references_data.py
|
8
|
-
KafNafParser/features_data.py
|
9
|
-
KafNafParser/header_data.py
|
10
|
-
KafNafParser/opinion_data.py
|
11
|
-
KafNafParser/references_data.py
|
12
|
-
KafNafParser/span_data.py
|
13
|
-
KafNafParser/term_data.py
|
14
|
-
KafNafParser/term_sentiment_data.py
|
15
|
-
KafNafParser/text_data.py
|
16
|
-
KafNafParser.egg-info/PKG-INFO
|
17
|
-
KafNafParser.egg-info/SOURCES.txt
|
18
|
-
KafNafParser.egg-info/dependency_links.txt
|
19
|
-
KafNafParser.egg-info/top_level.txt
|
20
|
-
KafNafParser/feature_extractor/__init__.py
|
21
|
-
KafNafParser/feature_extractor/constituency.py
|
22
|
-
KafNafParser/feature_extractor/dependency.py
|
@@ -1 +0,0 @@
|
|
1
|
-
|
@@ -1,47 +0,0 @@
|
|
1
|
-
../KafNafParser/__init__.py
|
2
|
-
../KafNafParser/header_data.py
|
3
|
-
../KafNafParser/text_data.py
|
4
|
-
../KafNafParser/term_data.py
|
5
|
-
../KafNafParser/entity_data.py
|
6
|
-
../KafNafParser/features_data.py
|
7
|
-
../KafNafParser/opinion_data.py
|
8
|
-
../KafNafParser/constituency_data.py
|
9
|
-
../KafNafParser/dependency_data.py
|
10
|
-
../KafNafParser/coreference_data.py
|
11
|
-
../KafNafParser/references_data.py
|
12
|
-
../KafNafParser/external_references_data.py
|
13
|
-
../KafNafParser/span_data.py
|
14
|
-
../KafNafParser/KafNafParserMod.py
|
15
|
-
../KafNafParser/term_sentiment_data.py
|
16
|
-
../KafNafParser/feature_extractor/dependency.py
|
17
|
-
../KafNafParser/feature_extractor/constituency.py
|
18
|
-
../KafNafParser/feature_extractor/__init__.py
|
19
|
-
../KafNafParser/__init__.pyc
|
20
|
-
../KafNafParser/header_data.pyc
|
21
|
-
../KafNafParser/text_data.pyc
|
22
|
-
../KafNafParser/term_data.pyc
|
23
|
-
../KafNafParser/entity_data.pyc
|
24
|
-
../KafNafParser/features_data.pyc
|
25
|
-
../KafNafParser/opinion_data.pyc
|
26
|
-
../KafNafParser/constituency_data.pyc
|
27
|
-
../KafNafParser/dependency_data.pyc
|
28
|
-
../KafNafParser/coreference_data.pyc
|
29
|
-
../KafNafParser/references_data.pyc
|
30
|
-
../KafNafParser/external_references_data.pyc
|
31
|
-
../KafNafParser/span_data.pyc
|
32
|
-
../KafNafParser/KafNafParserMod.pyc
|
33
|
-
../KafNafParser/term_sentiment_data.pyc
|
34
|
-
../KafNafParser/feature_extractor/dependency.pyc
|
35
|
-
../KafNafParser/feature_extractor/constituency.pyc
|
36
|
-
../KafNafParser/feature_extractor/__init__.pyc
|
37
|
-
../../../kaf_example.xml
|
38
|
-
../../../naf.dtd
|
39
|
-
../../../naf_example.xml
|
40
|
-
../../../test.py
|
41
|
-
../../../README.md
|
42
|
-
../../../LICENSE
|
43
|
-
./
|
44
|
-
SOURCES.txt
|
45
|
-
dependency_links.txt
|
46
|
-
top_level.txt
|
47
|
-
PKG-INFO
|
@@ -1 +0,0 @@
|
|
1
|
-
KafNafParser
|
@@ -1 +0,0 @@
|
|
1
|
-
pass
|
@@ -1 +0,0 @@
|
|
1
|
-
from common import *
|
@@ -1,28 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
|
3
|
-
from operator import itemgetter
|
4
|
-
|
5
|
-
# Get the max (key,count) from a dict like my_dict = {'a':20,'b':1,'c':50}
|
6
|
-
# It will return --> (c,50)
|
7
|
-
def get_max_distr_dict(my_dict):
|
8
|
-
vect = my_dict.items()
|
9
|
-
if len(vect) !=0:
|
10
|
-
vect.sort(key=itemgetter(1),reverse=True)
|
11
|
-
return vect[0]
|
12
|
-
return None
|
13
|
-
|
14
|
-
def normalize_pos(pos):
|
15
|
-
pos = pos.lower()
|
16
|
-
new_pos = pos
|
17
|
-
if pos in ['adj','a'] or pos[0:2]=='jj':
|
18
|
-
new_pos = 'a'
|
19
|
-
elif pos in ['adverb','r'] or pos[0:2]=='rb':
|
20
|
-
new_pos = 'r'
|
21
|
-
elif pos in ['anypos']:
|
22
|
-
new_pos = '*'
|
23
|
-
elif pos in ['noun','n'] or pos[0:2]=='nn' or pos[0:2]=='np':
|
24
|
-
new_pos = 'n'
|
25
|
-
elif pos in ['verb','v'] or pos[0]=='v':
|
26
|
-
new_pos = 'v'
|
27
|
-
return new_pos
|
28
|
-
|
@@ -1 +0,0 @@
|
|
1
|
-
from google_web_nl import *
|
@@ -1,156 +0,0 @@
|
|
1
|
-
import urllib2
|
2
|
-
import urllib
|
3
|
-
import sys
|
4
|
-
import time
|
5
|
-
|
6
|
-
try:
|
7
|
-
from lxml import etree
|
8
|
-
except:
|
9
|
-
import xml.etree.cElementTree as etree
|
10
|
-
|
11
|
-
class Citem:
|
12
|
-
def __init__(self,item=None):
|
13
|
-
self.hits = None
|
14
|
-
self.word = None
|
15
|
-
self.tokens = None
|
16
|
-
if item is not None:
|
17
|
-
if isinstance(item,str):
|
18
|
-
self.load_from_string(item)
|
19
|
-
else:
|
20
|
-
self.load_from_item_node(item)
|
21
|
-
|
22
|
-
def load_from_string(self,line):
|
23
|
-
## Example line: 22865,"de server van"
|
24
|
-
line = line.strip()
|
25
|
-
pos = line.find(',')
|
26
|
-
self.hits = int(line[:pos])
|
27
|
-
self.word = line[pos+2:-1]
|
28
|
-
self.tokens = self.word.split(' ')
|
29
|
-
|
30
|
-
def load_from_item_node(self,item_node):
|
31
|
-
hits_node = item_node.find('hits')
|
32
|
-
if hits_node is not None:
|
33
|
-
self.hits = int(hits_node.text)
|
34
|
-
|
35
|
-
word_node = item_node.find('word')
|
36
|
-
if word_node is not None:
|
37
|
-
self.word = str(word_node.text)
|
38
|
-
self.tokens = self.word.split(' ')
|
39
|
-
|
40
|
-
def __str__(self):
|
41
|
-
if self.word is not None and self.hits is not None:
|
42
|
-
s = str(self.tokens)+' ->'+str(self.hits)+' hits'
|
43
|
-
else:
|
44
|
-
s = 'None'
|
45
|
-
return s
|
46
|
-
|
47
|
-
def __repr__(self):
|
48
|
-
return self.__str__()
|
49
|
-
|
50
|
-
def get_hits(self):
|
51
|
-
return self.hits
|
52
|
-
|
53
|
-
def get_word(self):
|
54
|
-
return self.word
|
55
|
-
|
56
|
-
def get_tokens(self):
|
57
|
-
return self.tokens
|
58
|
-
|
59
|
-
|
60
|
-
class Cgoogle_web_nl:
|
61
|
-
def __init__(self):
|
62
|
-
self.url='http://www.let.rug.nl/gosse/bin/Web1T5_freq.perl'
|
63
|
-
self.sleep_this_time = 5 #First time to sleep in case of error
|
64
|
-
self.max_trials = 20
|
65
|
-
self.limit = 1000
|
66
|
-
self.min_freq = 100
|
67
|
-
self.items = []
|
68
|
-
|
69
|
-
|
70
|
-
def set_limit(self,l):
|
71
|
-
if not isinstance(l, int):
|
72
|
-
print>>sys.stderr,'Parameter for set_min_freq must be an integer and not ',type(m)
|
73
|
-
sys.exit(-1)
|
74
|
-
self.limit = l
|
75
|
-
|
76
|
-
def set_min_freq(self,m):
|
77
|
-
if not isinstance(m, int):
|
78
|
-
print>>sys.stderr,'Parameter for set_min_freq must be an integer and not ',type(m)
|
79
|
-
sys.exit(-1)
|
80
|
-
self.min_freq = m
|
81
|
-
|
82
|
-
def query(self,this_query,fixed='shown'):
|
83
|
-
#http://www.let.rug.nl/gosse/bin/Web1T5_freq.perl?
|
84
|
-
#query=interessante%20*&
|
85
|
-
#mode=XML&limit=10000&
|
86
|
-
#threshold=40&optimize=on&wildcards=listed+normally
|
87
|
-
#&fixed=shown&.cgifields=debug&.cgifields=optimize
|
88
|
-
dict_params = {}
|
89
|
-
dict_params['query'] = this_query
|
90
|
-
dict_params['mode']='XML'
|
91
|
-
#dict_params['mode']='csv'
|
92
|
-
dict_params['limit']=self.limit
|
93
|
-
dict_params['threshold']=self.min_freq
|
94
|
-
dict_params['optimize']='on'
|
95
|
-
dict_params['wildcards']='listed normally'
|
96
|
-
dict_params['fixed']=fixed
|
97
|
-
dict_params['.cgifields']='debug'
|
98
|
-
dict_params['.cgifields']='optimize'
|
99
|
-
params = urllib.urlencode(dict_params)
|
100
|
-
#print>>sys.stderr,self.url+'?%s' % params
|
101
|
-
|
102
|
-
|
103
|
-
done = False
|
104
|
-
this_url = None
|
105
|
-
trials = 0
|
106
|
-
while not done:
|
107
|
-
try:
|
108
|
-
this_url = urllib2.urlopen(self.url+'?%s' % params)
|
109
|
-
code = this_url.getcode()
|
110
|
-
except Exception as e:
|
111
|
-
code = -1
|
112
|
-
print>>sys.stderr,str(e)
|
113
|
-
|
114
|
-
if code == 200:
|
115
|
-
done = True
|
116
|
-
else:
|
117
|
-
print>>sys.stderr,'Got an error (code '+str(code)+') querying google web nl, with "'+this_query+'", retrying...'
|
118
|
-
print>>sys.stderr,'Trial ',trials,' waiting ',self.sleep_this_time,'seconds'
|
119
|
-
time.sleep(self.sleep_this_time)
|
120
|
-
trials += 1
|
121
|
-
self.sleep_this_time += 1
|
122
|
-
if trials == self.max_trials:
|
123
|
-
print>>sys.stderr,'Maximum number of trials reached. Giving up...'
|
124
|
-
done = True
|
125
|
-
this_url = None
|
126
|
-
|
127
|
-
if this_url is not None:
|
128
|
-
if dict_params['mode'] == 'XML':
|
129
|
-
xml_obj = etree.parse(this_url)
|
130
|
-
this_url.close()
|
131
|
-
|
132
|
-
for item_node in xml_obj.findall('item'):
|
133
|
-
self.items.append(Citem(item_node))
|
134
|
-
del xml_obj
|
135
|
-
else: #CSV
|
136
|
-
first_line = True
|
137
|
-
## The first line is frequency,"N-gram"
|
138
|
-
for line in this_url:
|
139
|
-
if not first_line:
|
140
|
-
self.items.append(Citem(line))
|
141
|
-
first_line = False
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
def get_items(self):
|
146
|
-
for item in self.items:
|
147
|
-
yield item
|
148
|
-
|
149
|
-
def get_all_items(self):
|
150
|
-
return self.items
|
151
|
-
|
152
|
-
def len(self):
|
153
|
-
return len(self.items)
|
154
|
-
def __iter__(self):
|
155
|
-
for item in self.items:
|
156
|
-
yield item
|
@@ -1 +0,0 @@
|
|
1
|
-
from feature_file import *
|