opener-kaf-naf-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +674 -0
- data/README.md +46 -0
- data/bin/kaf-naf-parser +8 -0
- data/bin/kaf-naf-parser-server +10 -0
- data/bin/kaf-to-naf +7 -0
- data/bin/naf-to-kaf +7 -0
- data/config.ru +4 -0
- data/core/kaf-naf-parser.py +42 -0
- data/core/packages/KafNafParser-1.2.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.3.tar.gz +0 -0
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/SOURCES.txt +22 -0
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/installed-files.txt +47 -0
- data/core/site-packages/pre_build/KafNafParser-1.2-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +338 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +80 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +300 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.py +71 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +200 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.py +15 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.py +90 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/SOURCES.txt +14 -0
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/installed-files.txt +23 -0
- data/core/site-packages/pre_build/VUA_pylib-1.3-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/ext/hack/support.rb +38 -0
- data/lib/opener/kaf_naf_parser.rb +77 -0
- data/lib/opener/kaf_naf_parser/cli.rb +92 -0
- data/lib/opener/kaf_naf_parser/public/markdown.css +284 -0
- data/lib/opener/kaf_naf_parser/server.rb +16 -0
- data/lib/opener/kaf_naf_parser/version.rb +5 -0
- data/lib/opener/kaf_naf_parser/views/index.erb +103 -0
- data/lib/opener/kaf_naf_parser/views/result.erb +15 -0
- data/opener-kaf-naf-parser.gemspec +38 -0
- data/pre_build_requirements.txt +3 -0
- metadata +283 -0
|
Binary file
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# included code for NAF/KAF
|
|
2
|
+
|
|
3
|
+
from lxml import etree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Cwf:
    """Thin wrapper around a single ``<wf>`` XML element.

    The token identifier lives in the ``id`` attribute when the object
    type is 'NAF' and in the ``wid`` attribute when it is 'KAF'.
    """

    def __init__(self, node=None, type='NAF'):
        """Wrap ``node``; build an empty ``<wf>`` element when it is None."""
        self.type = type
        self.node = etree.Element('wf') if node is None else node

    def _id_attribute(self):
        """Return the identifier attribute name for this type, or None."""
        if self.type == 'NAF':
            return 'id'
        if self.type == 'KAF':
            return 'wid'
        return None

    def get_node(self):
        """Return the wrapped XML element."""
        return self.node

    def set_id(self, this_id):
        """Store ``this_id`` as the token identifier.

        Does nothing when the type is neither 'NAF' nor 'KAF'.
        """
        attribute = self._id_attribute()
        if attribute is None:
            return None
        return self.node.set(attribute, this_id)

    def get_id(self):
        """Return the token identifier, or None when it cannot be resolved."""
        attribute = self._id_attribute()
        if attribute is None:
            return None
        return self.node.get(attribute)

    def set_text(self, this_text):
        """Set the text content of the element."""
        self.node.text = this_text

    def get_text(self):
        """Return the text content of the element."""
        return self.node.text

    def set_sent(self, this_sent):
        """Set the ``sent`` attribute of the element."""
        self.node.set('sent', this_sent)

    def get_sent(self):
        """Return the ``sent`` attribute of the element (None if absent)."""
        return self.node.get('sent')
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Ctext:
    """Wrapper around a ``<text>`` element holding ``<wf>`` children.

    ``self.idx`` maps each token identifier to its ``<wf>`` element so
    that ``get_wf`` can look tokens up by id. The identifier attribute
    is ``wid`` when the type is 'KAF' and ``id`` otherwise.
    """

    def __init__(self, node=None, type='NAF'):
        """Wrap ``node`` and index its ``<wf>`` children.

        When ``node`` is None an empty ``<text>`` element is created.
        """
        self.idx = {}
        self.type = type
        if node is None:
            self.node = etree.Element('text')
        else:
            self.node = node
            # Resolve the attribute name once instead of on every token.
            label_id = self.__get_id_attribute()
            for wf_node in self.__get_wf_nodes():
                self.idx[wf_node.get(label_id)] = wf_node

    def __get_id_attribute(self):
        """Return the attribute that stores token ids for the current type."""
        if self.type == 'KAF':
            return 'wid'
        return 'id'

    def get_node(self):
        """Return the wrapped ``<text>`` element."""
        return self.node

    def to_kaf(self):
        """Convert a NAF layer to KAF by renaming ``id`` to ``wid``."""
        if self.type == 'NAF':
            self.type = 'KAF'
            for node in self.__get_wf_nodes():
                node.set('wid', node.get('id'))
                del node.attrib['id']

    def to_naf(self):
        """Convert a KAF layer to NAF by renaming ``wid`` to ``id``."""
        if self.type == 'KAF':
            self.type = 'NAF'
            for node in self.__get_wf_nodes():
                node.set('id', node.get('wid'))
                del node.attrib['wid']

    def __get_wf_nodes(self):
        """Yield the direct ``<wf>`` children of the wrapped element."""
        for wf_node in self.node.findall('wf'):
            yield wf_node

    def __iter__(self):
        """Yield a ``Cwf`` wrapper for every ``<wf>`` child."""
        for wf_node in self.__get_wf_nodes():
            yield Cwf(node=wf_node, type=self.type)

    def get_wf(self, token_id):
        """Return the ``Cwf`` for ``token_id``, or None when it is unknown."""
        wf_node = self.idx.get(token_id)
        if wf_node is None:
            return None
        return Cwf(node=wf_node, type=self.type)

    def add_wf(self, wf_obj):
        """Append the element of ``wf_obj`` and register it in the index.

        The identifier is read from the element when it is added; a
        token whose id is set afterwards is not indexed.
        """
        wf_node = wf_obj.get_node()
        self.node.append(wf_node)
        # Keep the lookup index in sync so get_wf() finds added tokens.
        token_id = wf_node.get(self.__get_id_attribute())
        if token_id is not None:
            self.idx[token_id] = wf_node
|
|
90
|
+
|
|
Binary file
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 1.0
|
|
2
|
+
Name: VUA-pylib
|
|
3
|
+
Version: 1.3
|
|
4
|
+
Summary: Various KAF / NAF python helpers
|
|
5
|
+
Home-page: https://github.com/cltl/VUA_pylib
|
|
6
|
+
Author: Ruben Izquierdo
|
|
7
|
+
Author-email: r.izquierdobevia@vu.nl
|
|
8
|
+
License: UNKNOWN
|
|
9
|
+
Description: UNKNOWN
|
|
10
|
+
Platform: UNKNOWN
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README
|
|
2
|
+
VUA_pylib/__init__.py
|
|
3
|
+
VUA_pylib.egg-info/PKG-INFO
|
|
4
|
+
VUA_pylib.egg-info/SOURCES.txt
|
|
5
|
+
VUA_pylib.egg-info/dependency_links.txt
|
|
6
|
+
VUA_pylib.egg-info/top_level.txt
|
|
7
|
+
VUA_pylib/common/__init__.py
|
|
8
|
+
VUA_pylib/common/common.py
|
|
9
|
+
VUA_pylib/corpus_reader/__init__.py
|
|
10
|
+
VUA_pylib/corpus_reader/google_web_nl.py
|
|
11
|
+
VUA_pylib/io_utils/__init__.py
|
|
12
|
+
VUA_pylib/io_utils/feature_file.py
|
|
13
|
+
VUA_pylib/lexicon/__init__.py
|
|
14
|
+
VUA_pylib/lexicon/lexicon.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
../VUA_pylib/__init__.py
|
|
2
|
+
../VUA_pylib/lexicon/lexicon.py
|
|
3
|
+
../VUA_pylib/lexicon/__init__.py
|
|
4
|
+
../VUA_pylib/common/common.py
|
|
5
|
+
../VUA_pylib/common/__init__.py
|
|
6
|
+
../VUA_pylib/io_utils/feature_file.py
|
|
7
|
+
../VUA_pylib/io_utils/__init__.py
|
|
8
|
+
../VUA_pylib/corpus_reader/google_web_nl.py
|
|
9
|
+
../VUA_pylib/corpus_reader/__init__.py
|
|
10
|
+
../VUA_pylib/__init__.pyc
|
|
11
|
+
../VUA_pylib/lexicon/lexicon.pyc
|
|
12
|
+
../VUA_pylib/lexicon/__init__.pyc
|
|
13
|
+
../VUA_pylib/common/common.pyc
|
|
14
|
+
../VUA_pylib/common/__init__.pyc
|
|
15
|
+
../VUA_pylib/io_utils/feature_file.pyc
|
|
16
|
+
../VUA_pylib/io_utils/__init__.pyc
|
|
17
|
+
../VUA_pylib/corpus_reader/google_web_nl.pyc
|
|
18
|
+
../VUA_pylib/corpus_reader/__init__.pyc
|
|
19
|
+
./
|
|
20
|
+
SOURCES.txt
|
|
21
|
+
dependency_links.txt
|
|
22
|
+
top_level.txt
|
|
23
|
+
PKG-INFO
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
VUA_pylib
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pass
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from common import *
|
|
Binary file
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
from operator import itemgetter
|
|
4
|
+
|
|
5
|
+
# Get the max (key,count) from a dict like my_dict = {'a':20,'b':1,'c':50}
|
|
6
|
+
# It will return --> (c,50)
|
|
7
|
+
def get_max_distr_dict(my_dict):
    """Return the ``(key, count)`` pair with the highest count.

    For ``{'a': 20, 'b': 1, 'c': 50}`` the result is ``('c', 50)``.
    Returns None when ``my_dict`` is empty. When several keys share the
    highest count, the first one in the dict's iteration order wins.
    """
    if len(my_dict) == 0:
        return None
    # max() makes a single pass and also works on dict views, unlike
    # sorting the item list in place.
    return max(my_dict.items(), key=itemgetter(1))
|
|
13
|
+
|
|
14
|
+
def normalize_pos(pos):
    """Map a part-of-speech tag to a one-letter category.

    The tag is lower-cased first. Adjective tags give 'a', adverb tags
    'r', noun tags 'n', verb tags 'v' and 'anypos' gives '*'. Any other
    tag (including the empty string) is returned lower-cased.
    """
    pos = pos.lower()
    if pos in ('adj', 'a') or pos.startswith('jj'):
        return 'a'
    if pos in ('adverb', 'r') or pos.startswith('rb'):
        return 'r'
    if pos == 'anypos':
        return '*'
    if pos in ('noun', 'n') or pos.startswith(('nn', 'np')):
        return 'n'
    # startswith() is safe on an empty tag, where indexing pos[0] is not.
    if pos.startswith('v'):
        return 'v'
    return pos
|
|
28
|
+
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from google_web_nl import *
|
|
Binary file
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import urllib2
|
|
2
|
+
import urllib
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from lxml import etree
|
|
8
|
+
except:
|
|
9
|
+
import xml.etree.cElementTree as etree
|
|
10
|
+
|
|
11
|
+
class Citem:
    """One result entry: a word sequence and its hit count."""

    def __init__(self, item=None):
        """Build an empty item, or load it from a CSV line or an XML node.

        A ``str`` is parsed as a CSV line; any other non-None value is
        treated as an ``<item>`` XML node.
        """
        self.hits = None
        self.word = None
        self.tokens = None
        if item is None:
            return
        if isinstance(item, str):
            self.load_from_string(item)
        else:
            self.load_from_item_node(item)

    def load_from_string(self, line):
        """Parse a line such as ``22865,"de server van"``."""
        stripped = line.strip()
        comma = stripped.find(',')
        self.hits = int(stripped[:comma])
        # Skip the comma and opening quote, drop the closing quote.
        self.word = stripped[comma + 2:-1]
        self.tokens = self.word.split(' ')

    def load_from_item_node(self, item_node):
        """Read the ``hits`` and ``word`` children of ``item_node``."""
        hits_node = item_node.find('hits')
        word_node = item_node.find('word')
        if hits_node is not None:
            self.hits = int(hits_node.text)
        if word_node is not None:
            self.word = str(word_node.text)
            self.tokens = self.word.split(' ')

    def __str__(self):
        if self.word is None or self.hits is None:
            return 'None'
        return str(self.tokens) + ' ->' + str(self.hits) + ' hits'

    def __repr__(self):
        return self.__str__()

    def get_hits(self):
        """Return the hit count (None when not loaded)."""
        return self.hits

    def get_word(self):
        """Return the word sequence as one string (None when not loaded)."""
        return self.word

    def get_tokens(self):
        """Return the word sequence split on single spaces."""
        return self.tokens
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class Cgoogle_web_nl:
    """Client for the 'google web nl' frequency service at ``self.url``.

    Results of every ``query`` call are appended to ``self.items`` as
    ``Citem`` objects.
    """

    def __init__(self):
        self.url = 'http://www.let.rug.nl/gosse/bin/Web1T5_freq.perl'
        self.sleep_this_time = 5  # First time to sleep in case of error
        self.max_trials = 20      # Failed attempts allowed per query
        self.limit = 1000         # Sent as the 'limit' request parameter
        self.min_freq = 100       # Sent as the 'threshold' request parameter
        self.items = []

    def set_limit(self, l):
        """Set the 'limit' request parameter; exits when ``l`` is not an int."""
        if not isinstance(l, int):
            sys.stderr.write('Parameter for set_limit must be an integer and not  %s\n' % (type(l),))
            sys.exit(-1)
        self.limit = l

    def set_min_freq(self, m):
        """Set the 'threshold' request parameter; exits when ``m`` is not an int."""
        if not isinstance(m, int):
            sys.stderr.write('Parameter for set_min_freq must be an integer and not  %s\n' % (type(m),))
            sys.exit(-1)
        self.min_freq = m

    def query(self, this_query, fixed='shown'):
        """Send ``this_query`` to the service and store the returned items.

        The request is retried until it answers with HTTP 200 or
        ``self.max_trials`` attempts have failed. After each failure the
        method sleeps ``self.sleep_this_time`` seconds and then increases
        that delay by one second; the delay is not reset between calls.
        When every attempt fails nothing is added to ``self.items``.

        Example of the request that is built:
        http://www.let.rug.nl/gosse/bin/Web1T5_freq.perl?
        query=interessante%20*&mode=XML&limit=10000&threshold=40
        &optimize=on&wildcards=listed+normally&fixed=shown
        &.cgifields=debug&.cgifields=optimize
        """
        mode = 'XML'  # use 'csv' to switch to the CSV parser below
        # A list of pairs (not a dict) so that '.cgifields' can be sent
        # twice, as in the example URL; a dict keeps only the last value.
        params = urllib.urlencode([
            ('query', this_query),
            ('mode', mode),
            ('limit', self.limit),
            ('threshold', self.min_freq),
            ('optimize', 'on'),
            ('wildcards', 'listed normally'),
            ('fixed', fixed),
            ('.cgifields', 'debug'),
            ('.cgifields', 'optimize'),
        ])
        full_url = self.url + '?%s' % params

        this_url = self.__open_with_retries(full_url, this_query)
        if this_url is None:
            return

        try:
            if mode == 'XML':
                xml_obj = etree.parse(this_url)
                for item_node in xml_obj.findall('item'):
                    self.items.append(Citem(item_node))
            else:  # CSV
                # The first line is the header: frequency,"N-gram"
                first_line = True
                for line in this_url:
                    if not first_line:
                        self.items.append(Citem(line))
                    first_line = False
        finally:
            # Release the connection on every path, including parse errors.
            this_url.close()

    def __open_with_retries(self, full_url, this_query):
        """Return an open response with status 200, or None after giving up."""
        trials = 0
        while trials < self.max_trials:
            response = None
            try:
                response = urllib2.urlopen(full_url)
                code = response.getcode()
            except Exception as e:
                code = -1
                sys.stderr.write(str(e) + '\n')

            if code == 200:
                return response

            if response is not None:
                # Unusable answer: do not leak the connection.
                response.close()
            sys.stderr.write('Got an error (code ' + str(code) + ') querying google web nl, with "' + this_query + '", retrying...\n')
            sys.stderr.write('Trial  %s  waiting  %s seconds\n' % (trials, self.sleep_this_time))
            time.sleep(self.sleep_this_time)
            trials += 1
            self.sleep_this_time += 1

        sys.stderr.write('Maximum number of trials reached. Giving up...\n')
        return None

    def get_items(self):
        """Yield the stored items one by one."""
        for item in self.items:
            yield item

    def get_all_items(self):
        """Return the list of stored items (the internal list, not a copy)."""
        return self.items

    def len(self):
        """Return the number of stored items."""
        return len(self.items)

    def __iter__(self):
        for item in self.items:
            yield item
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from feature_file import *
|
|
Binary file
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from operator import itemgetter
|
|
2
|
+
import sys
|
|
3
|
+
import cPickle
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Cexample:
    """One example of a feature file: a label plus (name, value) features."""

    def __init__(self, str_line=None):
        """Create an empty example, or parse ``str_line`` when it is given."""
        self.label = ''
        self.features = []
        if str_line is not None:
            self.load_from_line(str_line)

    def load_from_line(self, str_line):
        """Parse a tab-separated line: the label, then ``name=value`` fields.

        Each field is split at its first '='; fields without '=' are
        skipped. Parsed features are appended to ``self.features``.
        """
        columns = str_line.strip().split('\t')
        self.label = columns[0]
        for column in columns[1:]:
            name, separator, value = column.partition('=')
            if separator:
                self.features.append((name, value))

    def __str__(self):
        return 'Label: ' + self.label + '\n' + 'Feats: ' + str(self.features)

    def get_label(self):
        """Return the label of the example."""
        return self.label

    def get_features(self):
        """Yield every feature as a ``(name, value)`` pair."""
        for feat_name, feat_value in self.features:
            yield feat_name, feat_value

    def get_all_features(self):
        """Return the list of ``(name, value)`` pairs (not a copy)."""
        return self.features
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Cfeature_index:
    """Index that maps composed ``name###value`` features to numbers.

    Numbers start at 1 and are handed out in order of first appearance.
    The index is used to write examples as ``label num:count ...`` lines.
    """

    def __init__(self):
        self.idx = {}

    def get_number_feat(self, feat):
        """Return the number assigned to ``feat``, or None when unknown."""
        return self.idx.get(feat, None)

    def add_feat(self, feat):
        """Return the number for ``feat``, assigning a new one if needed.

        A feature that is already indexed keeps its number; giving it a
        new one would make it collide with the next feature added.
        """
        known = self.idx.get(feat)
        if known is not None:
            return known
        num_feat = len(self.idx) + 1
        self.idx[feat] = num_feat
        return num_feat

    def compose_feat(self, name, value):
        """Join a feature name and value into the indexed string form."""
        return name + '###' + value

    def __encode_features(self, feats, modify_index=True):
        """Count the features of one example by feature number.

        Returns ``(pairs, clean_feats)``: ``pairs`` is a list of
        ``(number, count)`` sorted by number, and ``clean_feats`` holds
        every composed feature followed by a space. Unknown features are
        added to the index only when ``modify_index`` is true; otherwise
        they are left out of ``pairs``.
        """
        counts = {}
        composed = []
        for name, value in feats:
            my_feat = self.compose_feat(name, value)
            composed.append(my_feat)
            num_feat = self.get_number_feat(my_feat)
            if num_feat is None and modify_index:
                num_feat = self.add_feat(my_feat)
            if num_feat is not None:
                counts[num_feat] = counts.get(num_feat, 0) + 1
        clean_feats = ''.join([feat + ' ' for feat in composed])
        return sorted(counts.items(), key=itemgetter(0)), clean_feats

    def __write_svm_line(self, out_fic, label, feats_for_example, clean_feats):
        """Write one ``label num:count ... #features`` line to ``out_fic``."""
        out_fic.write(label)
        for feat, freq_feat in feats_for_example:
            out_fic.write(' %d:%d' % (feat, freq_feat))
        # Encode only text that is not already the native str type, so
        # byte strings are not re-encoded and both writers behave alike.
        if not isinstance(clean_feats, str):
            clean_feats = clean_feats.encode('utf-8')
        out_fic.write(' #' + clean_feats + '\n')

    def encode_feature_file_to_svm(self, feat_file_obj, out_fic=sys.stdout):
        """Write every example of ``feat_file_obj``, extending the index.

        ``feat_file_obj`` is iterated; each example must provide
        ``get_label()`` and ``get_all_features()``.
        """
        for example in feat_file_obj:
            feats_for_example, clean_feats = self.__encode_features(example.get_all_features())
            self.__write_svm_line(out_fic, example.get_label(), feats_for_example, clean_feats)

    def encode_example_for_classification(self, feats, out_fic, my_class='0'):
        """Write one example without changing the index.

        Features that are not in the index are left out of the numbered
        part of the line but still listed after the '#'.
        """
        feats_for_example, clean_feats = self.__encode_features(feats, modify_index=False)
        self.__write_svm_line(out_fic, my_class, feats_for_example, clean_feats)

    def save_to_file(self, filename):
        """Pickle the index dict to ``filename``."""
        with open(filename, 'wb') as fic:
            cPickle.dump(self.idx, fic, protocol=0)

    def load_from_file(self, filename):
        """Replace the index with the dict pickled in ``filename``.

        NOTE: unpickling can run arbitrary code; only load index files
        that come from a trusted source.
        """
        with open(filename, 'rb') as fic:
            self.idx = cPickle.load(fic)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class Cfeature_file:
    """Iterable over the examples stored in a feature file."""

    def __init__(self, filename=None):
        """Remember ``filename``; the file is only opened when iterating."""
        self.filename = filename

    def __iter__(self):
        """Yield a ``Cexample`` for every line that does not start with '#'.

        Yields nothing when no filename was given.
        """
        if self.filename is None:
            return
        # 'with' closes the file even if the consumer stops early or a
        # line fails to parse.
        with open(self.filename, 'r') as fic:
            for line in fic:
                if line[0] != '#':
                    yield Cexample(line)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
|