bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/README
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
Released packages can be downloaded from SourceForge.net:
|
2
|
+
|
3
|
+
http://sourceforge.net/projects/bio-bwa/files/
|
4
|
+
|
5
|
+
Introduction and FAQ are available at:
|
6
|
+
|
7
|
+
http://bio-bwa.sourceforge.net
|
8
|
+
|
9
|
+
Manual page at:
|
10
|
+
|
11
|
+
http://bio-bwa.sourceforge.net/bwa.shtml
|
12
|
+
|
13
|
+
Mailing list:
|
14
|
+
|
15
|
+
bio-bwa-help@lists.sourceforge.net
|
16
|
+
|
17
|
+
To sign up:
|
18
|
+
|
19
|
+
http://sourceforge.net/mail/?group_id=276243
|
20
|
+
|
21
|
+
Publications (Open Access):
|
22
|
+
|
23
|
+
http://www.ncbi.nlm.nih.gov/pubmed/20080505
|
24
|
+
http://www.ncbi.nlm.nih.gov/pubmed/19451168
|
25
|
+
|
26
|
+
Incomplete list of citations (via HubMed.org):
|
27
|
+
|
28
|
+
http://www.hubmed.org/references.cgi?uids=20080505
|
29
|
+
http://www.hubmed.org/references.cgi?uids=19451168
|
data/ext/bamlite.c
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include "bamlite.h"
|
6
|
+
|
7
|
+
/*********************
|
8
|
+
* from bam_endian.c *
|
9
|
+
*********************/
|
10
|
+
|
11
|
+
/* Detect the host byte order at run time.
 * Returns 1 on a big-endian host and 0 on a little-endian host. */
static inline int bam_is_big_endian()
{
	const long probe = 1;
	const char *first_byte = (const char *)&probe;
	/* the lowest-addressed byte holds the 1 only on a little-endian host */
	return first_byte[0] == 0;
}
|
16
|
+
/* Return v with its two bytes exchanged. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	const unsigned low_byte = v & 0xFFu;
	const unsigned high_byte = (v >> 8) & 0xFFu;
	return (uint16_t)((low_byte << 8) | high_byte);
}
|
20
|
+
/* Reverse, in place, the byte order of the 2-byte value stored at x.
 *
 * The bytes are exchanged through an unsigned char pointer rather than by
 * dereferencing x as uint16_t, so x may have any alignment (callers pass
 * pointers into the middle of a packed byte buffer).
 *
 * Returns x. */
static inline void *bam_swap_endian_2p(void *x)
{
	unsigned char *b = (unsigned char*)x, t;
	t = b[0]; b[0] = b[1]; b[1] = t;
	return x;
}
|
25
|
+
/* Return v with its four bytes in reverse order. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	const uint32_t b0 = v & 0xFFU;          /* least significant byte */
	const uint32_t b1 = (v >> 8) & 0xFFU;
	const uint32_t b2 = (v >> 16) & 0xFFU;
	const uint32_t b3 = (v >> 24) & 0xFFU;  /* most significant byte */
	return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
|
30
|
+
/* Reverse, in place, the byte order of the 4-byte value stored at x.
 *
 * The bytes are exchanged through an unsigned char pointer rather than by
 * dereferencing x as uint32_t, so x may have any alignment (callers pass
 * pointers into the middle of a packed byte buffer).
 *
 * Returns x. */
static inline void *bam_swap_endian_4p(void *x)
{
	unsigned char *b = (unsigned char*)x, t;
	t = b[0]; b[0] = b[3]; b[3] = t;
	t = b[1]; b[1] = b[2]; b[2] = t;
	return x;
}
|
35
|
+
/* Return v with its eight bytes in reverse order. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	uint64_t reversed = 0;
	int k;
	/* peel bytes off the low end of v and push them onto the low end of the result */
	for (k = 0; k < 8; ++k) {
		reversed = (reversed << 8) | (v & 0xFFU);
		v >>= 8;
	}
	return reversed;
}
|
41
|
+
/* Reverse, in place, the byte order of the 8-byte value stored at x.
 *
 * The bytes are exchanged through an unsigned char pointer rather than by
 * dereferencing x as uint64_t, so x may have any alignment (callers pass
 * pointers into the middle of a packed byte buffer).
 *
 * Returns x. */
static inline void *bam_swap_endian_8p(void *x)
{
	unsigned char *b = (unsigned char*)x, t;
	int lo, hi;
	for (lo = 0, hi = 7; lo < hi; ++lo, --hi) {
		t = b[lo]; b[lo] = b[hi]; b[hi] = t;
	}
	return x;
}
|
46
|
+
|
47
|
+
/**************
|
48
|
+
* from bam.c *
|
49
|
+
**************/
|
50
|
+
|
51
|
+
/* Non-zero when the host is big-endian. Assigned by bam_header_init() and
 * consulted by the readers to decide whether to byte-swap what they read. */
int bam_is_be;
|
52
|
+
|
53
|
+
bam_header_t *bam_header_init()
|
54
|
+
{
|
55
|
+
bam_is_be = bam_is_big_endian();
|
56
|
+
return (bam_header_t*)calloc(1, sizeof(bam_header_t));
|
57
|
+
}
|
58
|
+
|
59
|
+
/* Release a header and everything it owns. A NULL header is ignored.
 *
 * The per-target names, the name array and the length array are released
 * only when the name array is non-NULL; the text buffer and the header
 * itself are always released. */
void bam_header_destroy(bam_header_t *header)
{
	if (header == 0) return;
	if (header->target_name != 0) {
		int32_t k = 0;
		while (k < header->n_targets) {
			free(header->target_name[k]);
			++k;
		}
		free(header->target_name);
		free(header->target_len);
	}
	free(header->text);
	free(header);
}
|
72
|
+
|
73
|
+
/* Read the BAM header from fp.
 *
 * Layout consumed: the 4-byte magic "BAM\1", a 32-bit text length, the text,
 * a 32-bit target count, then for each target a 32-bit name length, the
 * name bytes and a 32-bit target length. 32-bit values are byte-swapped
 * when the host is big-endian.
 *
 * Returns a newly allocated header (release with bam_header_destroy()), or
 * 0 after printing a message to stderr when the magic is wrong, a read
 * comes up short, a length is negative, or memory cannot be allocated. */
bam_header_t *bam_header_read(bamFile fp)
{
	bam_header_t *header;
	char buf[4];
	int32_t i, l_text, name_len;

	// read "BAM1"
	if (bam_read(fp, buf, 4) != 4 || strncmp(buf, "BAM\001", 4) != 0) {
		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
		return 0;
	}
	header = bam_header_init();
	if (header == 0) {
		fprintf(stderr, "[bam_header_read] failed to read the BAM header (truncated/corrupted file or out of memory).\n");
		return 0;
	}
	// read plain text; the on-disk length is 32 bits wide, so it is read into
	// a 32-bit temporary instead of directly into the wider l_text field
	if (bam_read(fp, &l_text, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&l_text);
	if (l_text < 0) goto fail;
	header->l_text = l_text;
	header->text = (char*)calloc((size_t)l_text + 1, 1); // +1 keeps the text NUL-terminated
	if (header->text == 0) goto fail;
	if (bam_read(fp, header->text, l_text) != l_text) goto fail;
	// read the number of reference sequences
	if (bam_read(fp, &header->n_targets, 4) != 4) {
		header->n_targets = 0; // nothing was allocated for targets yet
		goto fail;
	}
	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
	if (header->n_targets < 0) goto fail;
	// read reference sequence names and lengths
	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
	// calloc(0, ...) may legitimately return NULL, so only a non-empty table must exist
	if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) goto fail;
	for (i = 0; i != header->n_targets; ++i) {
		if (bam_read(fp, &name_len, 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&name_len);
		if (name_len < 0) goto fail;
		// one spare zero byte guarantees termination even if the stored name lacks a NUL
		header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
		if (header->target_name[i] == 0) goto fail;
		if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
	}
	return header;

fail:
	fprintf(stderr, "[bam_header_read] failed to read the BAM header (truncated/corrupted file or out of memory).\n");
	// bam_header_destroy() frees target_len only together with target_name
	if (header->target_name == 0) {
		free(header->target_len);
		header->target_len = 0;
	}
	bam_header_destroy(header);
	return 0;
}
|
106
|
+
|
107
|
+
static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
|
108
|
+
{
|
109
|
+
uint8_t *s;
|
110
|
+
uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
|
111
|
+
s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
|
112
|
+
for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
|
113
|
+
while (s < data + data_len) {
|
114
|
+
uint8_t type;
|
115
|
+
s += 2; // skip key
|
116
|
+
type = toupper(*s); ++s; // skip type
|
117
|
+
if (type == 'C' || type == 'A') ++s;
|
118
|
+
else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
|
119
|
+
else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
|
120
|
+
else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
|
121
|
+
else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
|
122
|
+
}
|
123
|
+
}
|
124
|
+
|
125
|
+
/* Read one alignment record from fp into b, growing b->data as needed.
 *
 * Returns 4 + block_len (the number of bytes consumed) on success, or:
 *   -1  end of file (nothing could be read for the block length)
 *   -2  the block length itself is truncated
 *   -3  the fixed-size core could not be read in full
 *   -4  the block length is smaller than the core, the data buffer could
 *       not be grown, or the variable-length data is truncated
 */
int bam_read1(bamFile fp, bam1_t *b)
{
	bam1_core_t *c = &b->core;
	int32_t block_len, ret, i;
	uint32_t x[8];

	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
		if (ret == 0) return -1; // normal end-of-file
		else return -2; // truncated
	}
	if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3;
	if (bam_is_be) {
		bam_swap_endian_4p(&block_len);
		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
	}
	// a block shorter than the core would make data_len negative below
	if (block_len < (int32_t)sizeof(bam1_core_t)) return -4;
	c->tid = x[0]; c->pos = x[1];
	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
	c->l_qseq = x[4];
	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
	b->data_len = block_len - sizeof(bam1_core_t);
	if (b->m_data < b->data_len) {
		int m_data = b->data_len;
		uint8_t *grown;
		kroundup32(m_data);
		// keep the old buffer reachable if realloc() fails
		grown = (uint8_t*)realloc(b->data, m_data);
		if (grown == 0) return -4;
		b->data = grown;
		b->m_data = m_data;
	}
	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
	return 4 + block_len;
}
|
data/ext/bamlite.h
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
#ifndef BAMLITE_H_
|
2
|
+
#define BAMLITE_H_
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <zlib.h>
|
6
|
+
|
7
|
+
typedef gzFile bamFile;
|
8
|
+
#define bam_open(fn, mode) gzopen(fn, mode)
|
9
|
+
#define bam_dopen(fd, mode) gzdopen(fd, mode)
|
10
|
+
#define bam_close(fp) gzclose(fp)
|
11
|
+
#define bam_read(fp, buf, size) gzread(fp, buf, size)
|
12
|
+
|
13
|
+
/* BAM file header.
 *
 *   n_targets   - number of entries in target_name and target_len
 *   target_name - array of n_targets heap-allocated name strings
 *   target_len  - array of n_targets lengths, one per target
 *   l_text      - length in bytes of the header text (text holds l_text + 1
 *                 bytes so that it is NUL-terminated)
 *   n_text      - not referenced by the reader code in bamlite.c
 *   text        - heap-allocated header text
 *
 * Created by bam_header_init()/bam_header_read() and released by
 * bam_header_destroy(). */
typedef struct {
|
14
|
+
int32_t n_targets;
|
15
|
+
char **target_name;
|
16
|
+
uint32_t *target_len;
|
17
|
+
size_t l_text, n_text;
|
18
|
+
char *text;
|
19
|
+
} bam_header_t;
|
20
|
+
|
21
|
+
#define BAM_FPAIRED 1
|
22
|
+
#define BAM_FPROPER_PAIR 2
|
23
|
+
#define BAM_FUNMAP 4
|
24
|
+
#define BAM_FMUNMAP 8
|
25
|
+
#define BAM_FREVERSE 16
|
26
|
+
#define BAM_FMREVERSE 32
|
27
|
+
#define BAM_FREAD1 64
|
28
|
+
#define BAM_FREAD2 128
|
29
|
+
#define BAM_FSECONDARY 256
|
30
|
+
#define BAM_FQCFAIL 512
|
31
|
+
#define BAM_FDUP 1024
|
32
|
+
|
33
|
+
#define BAM_CIGAR_SHIFT 4
|
34
|
+
#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
|
35
|
+
|
36
|
+
#define BAM_CMATCH 0
|
37
|
+
#define BAM_CINS 1
|
38
|
+
#define BAM_CDEL 2
|
39
|
+
#define BAM_CREF_SKIP 3
|
40
|
+
#define BAM_CSOFT_CLIP 4
|
41
|
+
#define BAM_CHARD_CLIP 5
|
42
|
+
#define BAM_CPAD 6
|
43
|
+
|
44
|
+
/* Fixed-size core of one alignment record.
 *
 * bam_read1() reads sizeof(bam1_core_t) bytes from the file as eight 32-bit
 * words and unpacks them in this order:
 *   word 0: tid
 *   word 1: pos
 *   word 2: bin (high 16 bits), qual (next 8 bits), l_qname (low 8 bits)
 *   word 3: flag (high 16 bits), n_cigar (low 16 bits)
 *   word 4: l_qseq
 *   word 5: mtid
 *   word 6: mpos
 *   word 7: isize
 *
 * Sizes used by the bam1_* accessor macros:
 *   l_qname - bytes occupied by the query name at the start of the data
 *   n_cigar - number of 4-byte CIGAR words that follow the query name
 *   l_qseq  - number of bases; the packed sequence takes (l_qseq + 1)/2
 *             bytes and the qualities take l_qseq bytes
 *   flag    - bit set tested against the BAM_F* constants
 *
 * NOTE(review): tid/pos and mtid/mpos are presumably the reference id and
 * position of the read and of its mate, and isize the insert size - confirm
 * against the BAM specification. */
typedef struct {
|
45
|
+
int32_t tid;
|
46
|
+
int32_t pos;
|
47
|
+
uint32_t bin:16, qual:8, l_qname:8;
|
48
|
+
uint32_t flag:16, n_cigar:16;
|
49
|
+
int32_t l_qseq;
|
50
|
+
int32_t mtid;
|
51
|
+
int32_t mpos;
|
52
|
+
int32_t isize;
|
53
|
+
} bam1_core_t;
|
54
|
+
|
55
|
+
/* One alignment record: the fixed core plus a variable-length data buffer.
 *
 *   core     - fixed-size fields (see bam1_core_t)
 *   l_aux    - bytes left in data after the query name, CIGAR, sequence and
 *              quality regions (computed by bam_read1())
 *   data_len - number of bytes of data in use
 *   m_data   - allocated capacity of data in bytes
 *   data     - heap buffer laid out as query name, CIGAR, sequence,
 *              qualities, auxiliary fields (see the bam1_* macros)
 *
 * Allocate with bam_init1() and release with bam_destroy1(). */
typedef struct {
|
56
|
+
bam1_core_t core;
|
57
|
+
int l_aux, data_len, m_data;
|
58
|
+
uint8_t *data;
|
59
|
+
} bam1_t;
|
60
|
+
|
61
|
+
#ifndef kroundup32
|
62
|
+
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
63
|
+
#endif
|
64
|
+
|
65
|
+
#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
|
66
|
+
#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
|
67
|
+
#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
|
68
|
+
#define bam1_qname(b) ((char*)((b)->data))
|
69
|
+
#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
|
70
|
+
#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
|
71
|
+
#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
|
72
|
+
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
|
73
|
+
|
74
|
+
#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
|
75
|
+
#define bam_destroy1(b) do { \
|
76
|
+
if (b) { free((b)->data); free(b); } \
|
77
|
+
} while (0)
|
78
|
+
|
79
|
+
extern int bam_is_be;
|
80
|
+
|
81
|
+
#ifdef __cplusplus
|
82
|
+
extern "C" {
|
83
|
+
#endif
|
84
|
+
|
85
|
+
bam_header_t *bam_header_init(void);
|
86
|
+
void bam_header_destroy(bam_header_t *header);
|
87
|
+
bam_header_t *bam_header_read(bamFile fp);
|
88
|
+
int bam_read1(bamFile fp, bam1_t *b);
|
89
|
+
|
90
|
+
#ifdef __cplusplus
|
91
|
+
}
|
92
|
+
#endif
|
93
|
+
|
94
|
+
#endif
|
data/ext/bntseq.c
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#include <stdio.h>
|
29
|
+
#include <stdlib.h>
|
30
|
+
#include <string.h>
|
31
|
+
#include <zlib.h>
|
32
|
+
#include "bntseq.h"
|
33
|
+
#include "main.h"
|
34
|
+
#include "utils.h"
|
35
|
+
|
36
|
+
#include "kseq.h"
|
37
|
+
KSEQ_INIT(gzFile, gzread)
|
38
|
+
|
39
|
+
/* Lookup table from a byte value (0-255) to a nucleotide code:
 *   'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3,
 *   '-' -> 5, every other byte -> 4.
 * Each row below covers 16 consecutive byte values. */
unsigned char nst_nt4_table[256] = {
|
40
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
41
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
42
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
|
43
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
44
|
+
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
45
|
+
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
46
|
+
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
47
|
+
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
48
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
49
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
50
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
51
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
52
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
53
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
54
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
55
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
56
|
+
};
|
57
|
+
|
58
|
+
void bns_dump(const bntseq_t *bns, const char *prefix)
|
59
|
+
{
|
60
|
+
char str[1024];
|
61
|
+
FILE *fp;
|
62
|
+
int i;
|
63
|
+
{ // dump .ann
|
64
|
+
strcpy(str, prefix); strcat(str, ".ann");
|
65
|
+
fp = xopen(str, "w");
|
66
|
+
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
|
67
|
+
for (i = 0; i != bns->n_seqs; ++i) {
|
68
|
+
bntann1_t *p = bns->anns + i;
|
69
|
+
fprintf(fp, "%d %s", p->gi, p->name);
|
70
|
+
if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
|
71
|
+
else fprintf(fp, "\n");
|
72
|
+
fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
|
73
|
+
}
|
74
|
+
fclose(fp);
|
75
|
+
}
|
76
|
+
{ // dump .amb
|
77
|
+
strcpy(str, prefix); strcat(str, ".amb");
|
78
|
+
fp = xopen(str, "w");
|
79
|
+
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
|
80
|
+
for (i = 0; i != bns->n_holes; ++i) {
|
81
|
+
bntamb1_t *p = bns->ambs + i;
|
82
|
+
fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
|
83
|
+
}
|
84
|
+
fclose(fp);
|
85
|
+
}
|
86
|
+
}
|
87
|
+
|
88
|
+
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
|
89
|
+
{
|
90
|
+
char str[1024];
|
91
|
+
FILE *fp;
|
92
|
+
bntseq_t *bns;
|
93
|
+
long long xx;
|
94
|
+
int i;
|
95
|
+
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
|
96
|
+
{ // read .ann
|
97
|
+
fp = xopen(ann_filename, "r");
|
98
|
+
fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
|
99
|
+
bns->l_pac = xx;
|
100
|
+
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
|
101
|
+
for (i = 0; i < bns->n_seqs; ++i) {
|
102
|
+
bntann1_t *p = bns->anns + i;
|
103
|
+
char *q = str;
|
104
|
+
int c;
|
105
|
+
// read gi and sequence name
|
106
|
+
fscanf(fp, "%u%s", &p->gi, str);
|
107
|
+
p->name = strdup(str);
|
108
|
+
// read fasta comments
|
109
|
+
while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
|
110
|
+
*q = 0;
|
111
|
+
if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
|
112
|
+
else p->anno = strdup("");
|
113
|
+
// read the rest
|
114
|
+
fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
|
115
|
+
p->offset = xx;
|
116
|
+
}
|
117
|
+
fclose(fp);
|
118
|
+
}
|
119
|
+
{ // read .amb
|
120
|
+
int64_t l_pac;
|
121
|
+
int32_t n_seqs;
|
122
|
+
fp = xopen(amb_filename, "r");
|
123
|
+
fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
|
124
|
+
l_pac = xx;
|
125
|
+
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
|
126
|
+
bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
|
127
|
+
for (i = 0; i < bns->n_holes; ++i) {
|
128
|
+
bntamb1_t *p = bns->ambs + i;
|
129
|
+
fscanf(fp, "%lld%d%s", &xx, &p->len, str);
|
130
|
+
p->offset = xx;
|
131
|
+
p->amb = str[0];
|
132
|
+
}
|
133
|
+
fclose(fp);
|
134
|
+
}
|
135
|
+
{ // open .pac
|
136
|
+
bns->fp_pac = xopen(pac_filename, "rb");
|
137
|
+
}
|
138
|
+
return bns;
|
139
|
+
}
|
140
|
+
|
141
|
+
/* Rebuild a bntseq_t from <prefix>.ann, <prefix>.amb and <prefix>.pac.
 *
 * The three file names are built with bounded snprintf() calls and checked
 * with xassert(), so a prefix too long for the 1024-byte buffers is
 * reported instead of overrunning them.
 *
 * Returns whatever bns_restore_core() returns for those three files. */
bntseq_t *bns_restore(const char *prefix)
{
	char ann_filename[1024], amb_filename[1024], pac_filename[1024];
	int n_ann, n_amb, n_pac;
	n_ann = snprintf(ann_filename, sizeof(ann_filename), "%s.ann", prefix);
	n_amb = snprintf(amb_filename, sizeof(amb_filename), "%s.amb", prefix);
	n_pac = snprintf(pac_filename, sizeof(pac_filename), "%s.pac", prefix);
	xassert(n_ann >= 0 && (size_t)n_ann < sizeof(ann_filename)
			&& n_amb >= 0 && (size_t)n_amb < sizeof(amb_filename)
			&& n_pac >= 0 && (size_t)n_pac < sizeof(pac_filename), "prefix is too long.");
	return bns_restore_core(ann_filename, amb_filename, pac_filename);
}
|
149
|
+
|
150
|
+
/* Release bns and everything it owns: the open .pac stream (if any), the
 * hole array, every sequence's name and annotation string, and the
 * annotation array. A NULL bns is ignored. */
void bns_destroy(bntseq_t *bns)
{
	int k;
	if (bns == 0) return;
	if (bns->fp_pac) fclose(bns->fp_pac);
	free(bns->ambs);
	k = 0;
	while (k < bns->n_seqs) {
		free(bns->anns[k].name);
		free(bns->anns[k].anno);
		++k;
	}
	free(bns->anns);
	free(bns);
}
|
165
|
+
|
166
|
+
/* Convert the FASTA stream fp_fa into <prefix>.pac (2 bits per base), plus
 * the <prefix>.ann and <prefix>.amb files written through bns_dump().
 *
 * Bases are translated with nst_nt4_table. Any base whose code is >= 4 is
 * recorded as part of a "hole" (a run of the same ambiguous character) and
 * replaced in the packed output by a pseudo-random base drawn from
 * lrand48(), which is seeded with the fixed value 11.
 *
 * The .pac file ends with a trailer that makes its size l_pac/4 + 2 bytes
 * when l_pac is a multiple of 4 and l_pac/4 + 2 bytes otherwise as well:
 * an extra zero byte is added in the first case, and the final byte always
 * holds l_pac % 4.
 *
 * Two defensive points: the sequence byte is converted to unsigned char
 * before indexing the 256-entry table (a plain char may be negative), and
 * the output file name is built with a bounded snprintf(). */
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	bntamb1_t *q;
	int l_buf, n;
	unsigned char buf[0x10000];
	int32_t m_seqs, m_holes, l, i;
	FILE *fp;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	q = bns->ambs;
	l_buf = 0;
	n = snprintf(name, sizeof(name), "%s.pac", prefix);
	xassert(n >= 0 && (size_t)n < sizeof(name), "prefix is too long.");
	fp = xopen(name, "wb");
	memset(buf, 0, 0x10000);
	// read sequences
	while ((l = kseq_read(seq)) >= 0) {
		bntann1_t *p;
		int lasts;
		if (bns->n_seqs == m_seqs) {
			m_seqs <<= 1;
			bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
		}
		p = bns->anns + bns->n_seqs;
		p->name = strdup((char*)seq->name.s);
		p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
		p->gi = 0; p->len = l;
		p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
		p->n_ambs = 0;
		for (i = 0, lasts = 0; i < l; ++i) {
			// index with an unsigned byte: a negative char would read before the table
			int c = nst_nt4_table[(unsigned char)seq->seq.s[i]];
			if (c >= 4) { // N
				if (lasts == seq->seq.s[i]) { // contiguous N
					++q->len;
				} else {
					if (bns->n_holes == m_holes) {
						m_holes <<= 1;
						bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
					}
					q = bns->ambs + bns->n_holes;
					q->len = 1;
					q->offset = p->offset + i;
					q->amb = seq->seq.s[i];
					++p->n_ambs;
					++bns->n_holes;
				}
			}
			lasts = seq->seq.s[i];
			{ // fill buffer
				if (c >= 4) c = lrand48()&0x3;
				// buf holds 0x10000 bytes = 0x40000 two-bit bases
				if (l_buf == 0x40000) {
					fwrite(buf, 1, 0x10000, fp);
					memset(buf, 0, 0x10000);
					l_buf = 0;
				}
				buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
				++l_buf;
			}
		}
		++bns->n_seqs;
		bns->l_pac += seq->seq.l;
	}
	xassert(bns->l_pac, "zero length sequence.");
	{ // finalize .pac file
		ubyte_t ct;
		fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		fwrite(&ct, 1, 1, fp);
		// close .pac file
		fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
}
|
255
|
+
|
256
|
+
int bwa_fa2pac(int argc, char *argv[])
|
257
|
+
{
|
258
|
+
gzFile fp;
|
259
|
+
if (argc < 2) {
|
260
|
+
fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
|
261
|
+
return 1;
|
262
|
+
}
|
263
|
+
fp = xzopen(argv[1], "r");
|
264
|
+
bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
|
265
|
+
gzclose(fp);
|
266
|
+
return 0;
|
267
|
+
}
|
268
|
+
|
269
|
+
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
|
270
|
+
{
|
271
|
+
int left, mid, right, nn;
|
272
|
+
if (pac_coor >= bns->l_pac)
|
273
|
+
err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
|
274
|
+
// binary search for the sequence ID. Note that this is a bit different from the following one...
|
275
|
+
left = 0; mid = 0; right = bns->n_seqs;
|
276
|
+
while (left < right) {
|
277
|
+
mid = (left + right) >> 1;
|
278
|
+
if (pac_coor >= bns->anns[mid].offset) {
|
279
|
+
if (mid == bns->n_seqs - 1) break;
|
280
|
+
if (pac_coor < bns->anns[mid+1].offset) break;
|
281
|
+
left = mid + 1;
|
282
|
+
} else right = mid;
|
283
|
+
}
|
284
|
+
*real_seq = mid;
|
285
|
+
// binary search for holes
|
286
|
+
left = 0; right = bns->n_holes; nn = 0;
|
287
|
+
while (left < right) {
|
288
|
+
int64_t mid = (left + right) >> 1;
|
289
|
+
if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
|
290
|
+
else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
|
291
|
+
else { // overlap
|
292
|
+
if (pac_coor >= bns->ambs[mid].offset) {
|
293
|
+
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
|
294
|
+
bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
|
295
|
+
} else {
|
296
|
+
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
|
297
|
+
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
|
298
|
+
}
|
299
|
+
break;
|
300
|
+
}
|
301
|
+
}
|
302
|
+
return nn;
|
303
|
+
}
|