simple_bioc 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +8 -0
- data/lib/simple_bioc/annotation.rb +11 -0
- data/lib/simple_bioc/bioc_reader.rb +102 -0
- data/lib/simple_bioc/bioc_writer.rb +93 -0
- data/lib/simple_bioc/collection.rb +15 -0
- data/lib/simple_bioc/document.rb +29 -0
- data/lib/simple_bioc/location.rb +10 -0
- data/lib/simple_bioc/node.rb +12 -0
- data/lib/simple_bioc/node_base.rb +14 -0
- data/lib/simple_bioc/passage.rb +29 -0
- data/lib/simple_bioc/relation.rb +14 -0
- data/lib/simple_bioc/sentence.rb +20 -0
- data/lib/simple_bioc/version.rb +3 -0
- data/lib/simple_bioc.rb +14 -0
- data/simple_bioc.gemspec +27 -0
- data/spec/simple_bioc_spec.rb +14 -0
- data/xml/BioC.dtd +146 -0
- data/xml/PMID-8557975-simplified-sentences-tokens.xml +492 -0
- data/xml/PMID-8557975-simplified-sentences.xml +49 -0
- data/xml/abbr.key +71 -0
- data/xml/abbr.xml +1 -0
- data/xml/ascii.key +29 -0
- data/xml/ascii.xml +3 -0
- data/xml/everything-sentence.xml +1 -0
- data/xml/everything.key +8 -0
- data/xml/everything.xml +1 -0
- data/xml/lemma.key +51 -0
- data/xml/lemma.xml +1 -0
- data/xml/pos.key +49 -0
- data/xml/pos.xml +1 -0
- data/xml/sentence.key +36 -0
- data/xml/sentence.xml +1 -0
- metadata +153 -0
data/xml/pos.key
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
pos key
|
2
|
+
|
3
|
+
collection: 10 random PubMed documents with ASCII text split into
|
4
|
+
sentences and tokens by the MedPost tokenizer
|
5
|
+
|
6
|
+
Original source sentence.xml
|
7
|
+
|
8
|
+
source: PubMed
|
9
|
+
|
10
|
+
date: yyyymmdd. Date the documents were downloaded from PubMed
|
11
|
+
|
12
|
+
document: Title and possibly abstract from a PubMed reference
|
13
|
+
|
14
|
+
id: PubMed id
|
15
|
+
|
16
|
+
passage: Either title or abstract
|
17
|
+
|
18
|
+
infon["type"]: "title" or "abstract"
|
19
|
+
|
20
|
+
offset: The original Unicode byte offsets were not updated after
|
21
|
+
the ASCII conversion.
|
22
|
+
|
23
|
+
PubMed is extracted from an XML file, so literal offsets
|
24
|
+
would not be useful. Title has an offset of zero, while
|
25
|
+
the abstract is assumed to begin after the title and one
|
26
|
+
space. These offsets at least sequence the abstract after
|
27
|
+
the title.
|
28
|
+
|
29
|
+
sentence: One sentence of the passage as determined by the
|
30
|
+
MedPost sentence splitter
|
31
|
+
|
32
|
+
offset: A document offset to where the sentence begins in the
|
33
|
+
passage. Sum of the passage offset and the local offset
|
34
|
+
within the passage.
|
35
|
+
|
36
|
+
annotation: tokens in the sentence with their part-of-speech.
|
37
|
+
The annotations are of "type" "token"
|
38
|
+
|
39
|
+
infon["POS"]: The Penn Treebank part of speech tag as determined
|
40
|
+
by the MedPost biomedical part-of-speech tagger
|
41
|
+
|
42
|
+
location: offset: A document offset to where the annotated text
|
43
|
+
begins in the sentence. Sum of the sentence
|
44
|
+
offset and the local offset within the
|
45
|
+
sentence.
|
46
|
+
|
47
|
+
length: The length of the token.
|
48
|
+
|
49
|
+
text: ASCII text of the token.
|