bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/README ADDED
@@ -0,0 +1,29 @@
1
+ Released packages can be downloaded from SourceForge.net:
2
+
3
+ http://sourceforge.net/projects/bio-bwa/files/
4
+
5
+ Introduction and FAQ are available at:
6
+
7
+ http://bio-bwa.sourceforge.net
8
+
9
+ Manual page at:
10
+
11
+ http://bio-bwa.sourceforge.net/bwa.shtml
12
+
13
+ Mailing list:
14
+
15
+ bio-bwa-help@lists.sourceforge.net
16
+
17
+ To sign up:
18
+
19
+ http://sourceforge.net/mail/?group_id=276243
20
+
21
+ Publications (Open Access):
22
+
23
+ http://www.ncbi.nlm.nih.gov/pubmed/20080505
24
+ http://www.ncbi.nlm.nih.gov/pubmed/19451168
25
+
26
+ Incomplete list of citations (via HubMed.org):
27
+
28
+ http://www.hubmed.org/references.cgi?uids=20080505
29
+ http://www.hubmed.org/references.cgi?uids=19451168
data/ext/bamlite.c ADDED
@@ -0,0 +1,155 @@
1
+ #include <stdlib.h>
2
+ #include <ctype.h>
3
+ #include <string.h>
4
+ #include <stdio.h>
5
+ #include "bamlite.h"
6
+
7
+ /*********************
8
+ * from bam_endian.c *
9
+ *********************/
10
+
11
/* Report the host byte order: returns 1 on a big-endian machine, 0 otherwise. */
static inline int bam_is_big_endian()
{
	const long probe = 1;
	const char *lowest_addr = (const char *)&probe;

	/* A little-endian host stores the 1 in the byte at the lowest address. */
	return lowest_addr[0] == 0;
}
16
/* Return v with its two bytes exchanged. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	return (uint16_t)(((v & 0x00FFU) << 8) | ((v & 0xFF00U) >> 8));
}
/*
 * Byte-swap the 16-bit value stored at x in place and return x.
 *
 * x may point at any offset inside a byte buffer (swap_endian_data() passes
 * pointers into the auxiliary data), so the value is moved with memcpy()
 * instead of being accessed through a uint16_t* cast, which would be a
 * misaligned / type-punned access.
 */
static inline void *bam_swap_endian_2p(void *x)
{
	uint16_t v;

	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_2(v);
	memcpy(x, &v, sizeof v);
	return x;
}
25
/* Return v with its four bytes in reverse order. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	v = ((v & 0x0000FFFFU) << 16) | (v >> 16);                  /* swap 16-bit halves */
	return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); /* swap bytes in each half */
}
/*
 * Byte-swap the 32-bit value stored at x in place and return x.
 *
 * x may point at any offset inside a byte buffer, so the value is moved with
 * memcpy() rather than accessed through a uint32_t* cast (which would be a
 * misaligned / type-punned access).
 */
static inline void *bam_swap_endian_4p(void *x)
{
	uint32_t v;

	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_4(v);
	memcpy(x, &v, sizeof v);
	return x;
}
35
/* Return v with its eight bytes in reverse order. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	v = ((v & 0x00000000FFFFFFFFULL) << 32) | (v >> 32);                             /* swap 32-bit halves */
	v = ((v & 0x0000FFFF0000FFFFULL) << 16) | ((v & 0xFFFF0000FFFF0000ULL) >> 16);   /* swap 16-bit pairs */
	return ((v & 0x00FF00FF00FF00FFULL) << 8) | ((v & 0xFF00FF00FF00FF00ULL) >> 8);  /* swap bytes */
}
/*
 * Byte-swap the 64-bit value stored at x in place and return x.
 *
 * x may point at any offset inside a byte buffer, so the value is moved with
 * memcpy() rather than accessed through a uint64_t* cast (which would be a
 * misaligned / type-punned access).
 */
static inline void *bam_swap_endian_8p(void *x)
{
	uint64_t v;

	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_8(v);
	memcpy(x, &v, sizeof v);
	return x;
}
46
+
47
+ /**************
48
+ * from bam.c *
49
+ **************/
50
+
51
+ int bam_is_be;
52
+
53
+ bam_header_t *bam_header_init()
54
+ {
55
+ bam_is_be = bam_is_big_endian();
56
+ return (bam_header_t*)calloc(1, sizeof(bam_header_t));
57
+ }
58
+
59
+ void bam_header_destroy(bam_header_t *header)
60
+ {
61
+ int32_t i;
62
+ if (header == 0) return;
63
+ if (header->target_name) {
64
+ for (i = 0; i < header->n_targets; ++i)
65
+ free(header->target_name[i]);
66
+ free(header->target_name);
67
+ free(header->target_len);
68
+ }
69
+ free(header->text);
70
+ free(header);
71
+ }
72
+
73
+ bam_header_t *bam_header_read(bamFile fp)
74
+ {
75
+ bam_header_t *header;
76
+ char buf[4];
77
+ int magic_len;
78
+ int32_t i = 1, name_len;
79
+ // read "BAM1"
80
+ magic_len = bam_read(fp, buf, 4);
81
+ if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
82
+ fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
83
+ return 0;
84
+ }
85
+ header = bam_header_init();
86
+ // read plain text and the number of reference sequences
87
+ bam_read(fp, &header->l_text, 4);
88
+ if (bam_is_be) bam_swap_endian_4p(&header->l_text);
89
+ header->text = (char*)calloc(header->l_text + 1, 1);
90
+ bam_read(fp, header->text, header->l_text);
91
+ bam_read(fp, &header->n_targets, 4);
92
+ if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
93
+ // read reference sequence names and lengths
94
+ header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
95
+ header->target_len = (uint32_t*)calloc(header->n_targets, 4);
96
+ for (i = 0; i != header->n_targets; ++i) {
97
+ bam_read(fp, &name_len, 4);
98
+ if (bam_is_be) bam_swap_endian_4p(&name_len);
99
+ header->target_name[i] = (char*)calloc(name_len, 1);
100
+ bam_read(fp, header->target_name[i], name_len);
101
+ bam_read(fp, &header->target_len[i], 4);
102
+ if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
103
+ }
104
+ return header;
105
+ }
106
+
107
+ static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
108
+ {
109
+ uint8_t *s;
110
+ uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
111
+ s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
112
+ for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
113
+ while (s < data + data_len) {
114
+ uint8_t type;
115
+ s += 2; // skip key
116
+ type = toupper(*s); ++s; // skip type
117
+ if (type == 'C' || type == 'A') ++s;
118
+ else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
119
+ else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
120
+ else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
121
+ else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
122
+ }
123
+ }
124
+
125
+ int bam_read1(bamFile fp, bam1_t *b)
126
+ {
127
+ bam1_core_t *c = &b->core;
128
+ int32_t block_len, ret, i;
129
+ uint32_t x[8];
130
+
131
+ if ((ret = bam_read(fp, &block_len, 4)) != 4) {
132
+ if (ret == 0) return -1; // normal end-of-file
133
+ else return -2; // truncated
134
+ }
135
+ if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3;
136
+ if (bam_is_be) {
137
+ bam_swap_endian_4p(&block_len);
138
+ for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
139
+ }
140
+ c->tid = x[0]; c->pos = x[1];
141
+ c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
142
+ c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
143
+ c->l_qseq = x[4];
144
+ c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
145
+ b->data_len = block_len - sizeof(bam1_core_t);
146
+ if (b->m_data < b->data_len) {
147
+ b->m_data = b->data_len;
148
+ kroundup32(b->m_data);
149
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
150
+ }
151
+ if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
152
+ b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
153
+ if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
154
+ return 4 + block_len;
155
+ }
data/ext/bamlite.h ADDED
@@ -0,0 +1,94 @@
1
+ #ifndef BAMLITE_H_
2
+ #define BAMLITE_H_
3
+
4
+ #include <stdint.h>
5
+ #include <zlib.h>
6
+
7
+ typedef gzFile bamFile;
8
+ #define bam_open(fn, mode) gzopen(fn, mode)
9
+ #define bam_dopen(fd, mode) gzdopen(fd, mode)
10
+ #define bam_close(fp) gzclose(fp)
11
+ #define bam_read(fp, buf, size) gzread(fp, buf, size)
12
+
13
+ typedef struct {
14
+ int32_t n_targets;
15
+ char **target_name;
16
+ uint32_t *target_len;
17
+ size_t l_text, n_text;
18
+ char *text;
19
+ } bam_header_t;
20
+
21
+ #define BAM_FPAIRED 1
22
+ #define BAM_FPROPER_PAIR 2
23
+ #define BAM_FUNMAP 4
24
+ #define BAM_FMUNMAP 8
25
+ #define BAM_FREVERSE 16
26
+ #define BAM_FMREVERSE 32
27
+ #define BAM_FREAD1 64
28
+ #define BAM_FREAD2 128
29
+ #define BAM_FSECONDARY 256
30
+ #define BAM_FQCFAIL 512
31
+ #define BAM_FDUP 1024
32
+
33
+ #define BAM_CIGAR_SHIFT 4
34
+ #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
35
+
36
+ #define BAM_CMATCH 0
37
+ #define BAM_CINS 1
38
+ #define BAM_CDEL 2
39
+ #define BAM_CREF_SKIP 3
40
+ #define BAM_CSOFT_CLIP 4
41
+ #define BAM_CHARD_CLIP 5
42
+ #define BAM_CPAD 6
43
+
44
+ typedef struct {
45
+ int32_t tid;
46
+ int32_t pos;
47
+ uint32_t bin:16, qual:8, l_qname:8;
48
+ uint32_t flag:16, n_cigar:16;
49
+ int32_t l_qseq;
50
+ int32_t mtid;
51
+ int32_t mpos;
52
+ int32_t isize;
53
+ } bam1_core_t;
54
+
55
+ typedef struct {
56
+ bam1_core_t core;
57
+ int l_aux, data_len, m_data;
58
+ uint8_t *data;
59
+ } bam1_t;
60
+
61
+ #ifndef kroundup32
62
+ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
63
+ #endif
64
+
65
+ #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
66
+ #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
67
+ #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
68
+ #define bam1_qname(b) ((char*)((b)->data))
69
+ #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
70
+ #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
71
+ #define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
72
+ #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
73
+
74
+ #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
75
+ #define bam_destroy1(b) do { \
76
+ if (b) { free((b)->data); free(b); } \
77
+ } while (0)
78
+
79
+ extern int bam_is_be;
80
+
81
+ #ifdef __cplusplus
82
+ extern "C" {
83
+ #endif
84
+
85
+ bam_header_t *bam_header_init(void);
86
+ void bam_header_destroy(bam_header_t *header);
87
+ bam_header_t *bam_header_read(bamFile fp);
88
+ int bam_read1(bamFile fp, bam1_t *b);
89
+
90
+ #ifdef __cplusplus
91
+ }
92
+ #endif
93
+
94
+ #endif
data/ext/bntseq.c ADDED
@@ -0,0 +1,303 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdio.h>
29
+ #include <stdlib.h>
30
+ #include <string.h>
31
+ #include <zlib.h>
32
+ #include "bntseq.h"
33
+ #include "main.h"
34
+ #include "utils.h"
35
+
36
+ #include "kseq.h"
37
+ KSEQ_INIT(gzFile, gzread)
38
+
39
/*
 * Byte -> nucleotide code lookup: A/a = 0, C/c = 1, G/g = 2, T/t = 3,
 * '-' = 5 and every other byte = 4.  One row per 16 byte values.
 */
unsigned char nst_nt4_table[256] = {
	/* 0x00 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, /* A C G */
	/* 0x50 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, /* T */
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, /* a c g */
	/* 0x70 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, /* t */
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xA0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xB0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xC0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xD0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xE0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xF0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
57
+
58
+ void bns_dump(const bntseq_t *bns, const char *prefix)
59
+ {
60
+ char str[1024];
61
+ FILE *fp;
62
+ int i;
63
+ { // dump .ann
64
+ strcpy(str, prefix); strcat(str, ".ann");
65
+ fp = xopen(str, "w");
66
+ fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
67
+ for (i = 0; i != bns->n_seqs; ++i) {
68
+ bntann1_t *p = bns->anns + i;
69
+ fprintf(fp, "%d %s", p->gi, p->name);
70
+ if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
71
+ else fprintf(fp, "\n");
72
+ fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
73
+ }
74
+ fclose(fp);
75
+ }
76
+ { // dump .amb
77
+ strcpy(str, prefix); strcat(str, ".amb");
78
+ fp = xopen(str, "w");
79
+ fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
80
+ for (i = 0; i != bns->n_holes; ++i) {
81
+ bntamb1_t *p = bns->ambs + i;
82
+ fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
83
+ }
84
+ fclose(fp);
85
+ }
86
+ }
87
+
88
+ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
89
+ {
90
+ char str[1024];
91
+ FILE *fp;
92
+ bntseq_t *bns;
93
+ long long xx;
94
+ int i;
95
+ bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
96
+ { // read .ann
97
+ fp = xopen(ann_filename, "r");
98
+ fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
99
+ bns->l_pac = xx;
100
+ bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
101
+ for (i = 0; i < bns->n_seqs; ++i) {
102
+ bntann1_t *p = bns->anns + i;
103
+ char *q = str;
104
+ int c;
105
+ // read gi and sequence name
106
+ fscanf(fp, "%u%s", &p->gi, str);
107
+ p->name = strdup(str);
108
+ // read fasta comments
109
+ while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
110
+ *q = 0;
111
+ if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
112
+ else p->anno = strdup("");
113
+ // read the rest
114
+ fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
115
+ p->offset = xx;
116
+ }
117
+ fclose(fp);
118
+ }
119
+ { // read .amb
120
+ int64_t l_pac;
121
+ int32_t n_seqs;
122
+ fp = xopen(amb_filename, "r");
123
+ fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
124
+ l_pac = xx;
125
+ xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
126
+ bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
127
+ for (i = 0; i < bns->n_holes; ++i) {
128
+ bntamb1_t *p = bns->ambs + i;
129
+ fscanf(fp, "%lld%d%s", &xx, &p->len, str);
130
+ p->offset = xx;
131
+ p->amb = str[0];
132
+ }
133
+ fclose(fp);
134
+ }
135
+ { // open .pac
136
+ bns->fp_pac = xopen(pac_filename, "rb");
137
+ }
138
+ return bns;
139
+ }
140
+
141
+ bntseq_t *bns_restore(const char *prefix)
142
+ {
143
+ char ann_filename[1024], amb_filename[1024], pac_filename[1024];
144
+ strcat(strcpy(ann_filename, prefix), ".ann");
145
+ strcat(strcpy(amb_filename, prefix), ".amb");
146
+ strcat(strcpy(pac_filename, prefix), ".pac");
147
+ return bns_restore_core(ann_filename, amb_filename, pac_filename);
148
+ }
149
+
150
+ void bns_destroy(bntseq_t *bns)
151
+ {
152
+ if (bns == 0) return;
153
+ else {
154
+ int i;
155
+ if (bns->fp_pac) fclose(bns->fp_pac);
156
+ free(bns->ambs);
157
+ for (i = 0; i < bns->n_seqs; ++i) {
158
+ free(bns->anns[i].name);
159
+ free(bns->anns[i].anno);
160
+ }
161
+ free(bns->anns);
162
+ free(bns);
163
+ }
164
+ }
165
+
166
+ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
167
+ {
168
+ kseq_t *seq;
169
+ char name[1024];
170
+ bntseq_t *bns;
171
+ bntamb1_t *q;
172
+ int l_buf;
173
+ unsigned char buf[0x10000];
174
+ int32_t m_seqs, m_holes, l, i;
175
+ FILE *fp;
176
+
177
+ // initialization
178
+ seq = kseq_init(fp_fa);
179
+ bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
180
+ bns->seed = 11; // fixed seed for random generator
181
+ srand48(bns->seed);
182
+ m_seqs = m_holes = 8;
183
+ bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
184
+ bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
185
+ q = bns->ambs;
186
+ l_buf = 0;
187
+ strcpy(name, prefix); strcat(name, ".pac");
188
+ fp = xopen(name, "wb");
189
+ memset(buf, 0, 0x10000);
190
+ // read sequences
191
+ while ((l = kseq_read(seq)) >= 0) {
192
+ bntann1_t *p;
193
+ int lasts;
194
+ if (bns->n_seqs == m_seqs) {
195
+ m_seqs <<= 1;
196
+ bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
197
+ }
198
+ p = bns->anns + bns->n_seqs;
199
+ p->name = strdup((char*)seq->name.s);
200
+ p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
201
+ p->gi = 0; p->len = l;
202
+ p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
203
+ p->n_ambs = 0;
204
+ for (i = 0, lasts = 0; i < l; ++i) {
205
+ int c = nst_nt4_table[(int)seq->seq.s[i]];
206
+ if (c >= 4) { // N
207
+ if (lasts == seq->seq.s[i]) { // contiguous N
208
+ ++q->len;
209
+ } else {
210
+ if (bns->n_holes == m_holes) {
211
+ m_holes <<= 1;
212
+ bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
213
+ }
214
+ q = bns->ambs + bns->n_holes;
215
+ q->len = 1;
216
+ q->offset = p->offset + i;
217
+ q->amb = seq->seq.s[i];
218
+ ++p->n_ambs;
219
+ ++bns->n_holes;
220
+ }
221
+ }
222
+ lasts = seq->seq.s[i];
223
+ { // fill buffer
224
+ if (c >= 4) c = lrand48()&0x3;
225
+ if (l_buf == 0x40000) {
226
+ fwrite(buf, 1, 0x10000, fp);
227
+ memset(buf, 0, 0x10000);
228
+ l_buf = 0;
229
+ }
230
+ buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
231
+ ++l_buf;
232
+ }
233
+ }
234
+ ++bns->n_seqs;
235
+ bns->l_pac += seq->seq.l;
236
+ }
237
+ xassert(bns->l_pac, "zero length sequence.");
238
+ { // finalize .pac file
239
+ ubyte_t ct;
240
+ fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
241
+ // the following codes make the pac file size always (l_pac/4+1+1)
242
+ if (bns->l_pac % 4 == 0) {
243
+ ct = 0;
244
+ fwrite(&ct, 1, 1, fp);
245
+ }
246
+ ct = bns->l_pac % 4;
247
+ fwrite(&ct, 1, 1, fp);
248
+ // close .pac file
249
+ fclose(fp);
250
+ }
251
+ bns_dump(bns, prefix);
252
+ bns_destroy(bns);
253
+ kseq_destroy(seq);
254
+ }
255
+
256
+ int bwa_fa2pac(int argc, char *argv[])
257
+ {
258
+ gzFile fp;
259
+ if (argc < 2) {
260
+ fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
261
+ return 1;
262
+ }
263
+ fp = xzopen(argv[1], "r");
264
+ bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
265
+ gzclose(fp);
266
+ return 0;
267
+ }
268
+
269
+ int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
270
+ {
271
+ int left, mid, right, nn;
272
+ if (pac_coor >= bns->l_pac)
273
+ err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
274
+ // binary search for the sequence ID. Note that this is a bit different from the following one...
275
+ left = 0; mid = 0; right = bns->n_seqs;
276
+ while (left < right) {
277
+ mid = (left + right) >> 1;
278
+ if (pac_coor >= bns->anns[mid].offset) {
279
+ if (mid == bns->n_seqs - 1) break;
280
+ if (pac_coor < bns->anns[mid+1].offset) break;
281
+ left = mid + 1;
282
+ } else right = mid;
283
+ }
284
+ *real_seq = mid;
285
+ // binary search for holes
286
+ left = 0; right = bns->n_holes; nn = 0;
287
+ while (left < right) {
288
+ int64_t mid = (left + right) >> 1;
289
+ if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
290
+ else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
291
+ else { // overlap
292
+ if (pac_coor >= bns->ambs[mid].offset) {
293
+ nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
294
+ bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
295
+ } else {
296
+ nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
297
+ bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
298
+ }
299
+ break;
300
+ }
301
+ }
302
+ return nn;
303
+ }