naapam 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. naapam-0.1.12/.python-version +1 -0
  2. naapam-0.1.12/CMakeLists.txt +8 -0
  3. naapam-0.1.12/LICENSE.md +9 -0
  4. naapam-0.1.12/PKG-INFO +34 -0
  5. naapam-0.1.12/README.md +16 -0
  6. naapam-0.1.12/pyproject.toml +54 -0
  7. naapam-0.1.12/src/correct_micro_homology.awk +178 -0
  8. naapam-0.1.12/src/headers/align.h +403 -0
  9. naapam-0.1.12/src/headers/parser.h +120 -0
  10. naapam-0.1.12/src/main.cpp +202 -0
  11. naapam-0.1.12/src/naapam/__init__.py +0 -0
  12. naapam-0.1.12/src/naapam/align.py +940 -0
  13. naapam-0.1.12/src/naapam/analyze.py +327 -0
  14. naapam-0.1.12/src/naapam/draw.py +94 -0
  15. naapam-0.1.12/src/naapam/filter_configs/__init__.py +0 -0
  16. naapam-0.1.12/src/naapam/filter_configs/align.cluster_func_control_by_mutant.yaml +1 -0
  17. naapam-0.1.12/src/naapam/filter_configs/align.filter_low_quality_barcode.yaml +4 -0
  18. naapam-0.1.12/src/naapam/filter_configs/align.filter_low_quality_mutant.yaml +6 -0
  19. naapam-0.1.12/src/naapam/filter_configs/align.filter_nofunc_control.yaml +48 -0
  20. naapam-0.1.12/src/naapam/filter_configs/align.filter_treat.yaml +15 -0
  21. naapam-0.1.12/src/naapam/filter_configs/align.generate_reference.yaml +1 -0
  22. naapam-0.1.12/src/naapam/filter_configs/analyze.collect_data.yaml +1 -0
  23. naapam-0.1.12/src/naapam/filter_configs/analyze.correct_alg.yaml +1 -0
  24. naapam-0.1.12/src/naapam/filter_configs/analyze.filter_mutant.yaml +4 -0
  25. naapam-0.1.12/src/naapam/filter_configs/analyze.filter_ref.yaml +17 -0
  26. naapam-0.1.12/src/naapam/filter_configs/analyze.stat_mutant.yaml +1 -0
  27. naapam-0.1.12/src/naapam/filter_configs/analyze.stat_ref.yaml +1 -0
  28. naapam-0.1.12/src/naapam/fit.py +65 -0
  29. naapam-0.1.12/src/naapam/mix.py +141 -0
  30. naapam-0.1.12/src/naapam/notebooks/__init__.py +0 -0
  31. naapam-0.1.12/src/naapam/notebooks/align.ipynb +521 -0
  32. naapam-0.1.12/src/naapam/notebooks/analyze.ipynb +301 -0
  33. naapam-0.1.12/src/naapam/parse.py +529 -0
  34. naapam-0.1.12/src/naapam/plasmids/__init__.py +0 -0
  35. naapam-0.1.12/src/naapam/plasmids/a1.csv +12422 -0
  36. naapam-0.1.12/src/naapam/plasmids/a2.csv +12422 -0
  37. naapam-0.1.12/src/naapam/plasmids/a3.csv +12422 -0
  38. naapam-0.1.12/src/naapam/plasmids/final_hgsgrna_libb_all_0811-NGG.csv +38714 -0
  39. naapam-0.1.12/src/naapam/plasmids/final_hgsgrna_libb_all_0811_NAA_scaffold_nbt.csv +38714 -0
  40. naapam-0.1.12/src/naapam/plasmids/g1n.csv +12422 -0
  41. naapam-0.1.12/src/naapam/plasmids/g2n.csv +12422 -0
  42. naapam-0.1.12/src/naapam/plasmids/g3n.csv +12422 -0
  43. naapam-0.1.12/src/naapam/scripts/__init__.py +13 -0
  44. naapam-0.1.12/src/naapam/scripts/align.sh +41 -0
  45. naapam-0.1.12/src/naapam/scripts/parse.sh +4 -0
  46. naapam-0.1.12/src/naapam/utils.py +439 -0
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,8 @@
cmake_minimum_required(VERSION 3.20)

# SKBUILD_PROJECT_NAME and SKBUILD_SCRIPTS_DIR are expected to be supplied by
# scikit-build-core, the build backend declared in pyproject.toml.
project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX)

# -static is a link-time flag. It used to be passed via target_compile_options,
# where it never reaches the link step, so the executable was in fact linked
# dynamically. The option below keeps that effective default while making
# static linking actually work when requested (it is not available on every
# platform/toolchain, hence opt-in).
option(NAAPAM_STATIC_LINK "Link the rearrangement executable statically" OFF)

add_executable(rearrangement src/main.cpp)

# Request C++20 portably instead of hard-coding -std=c++20; with extensions
# disabled GCC/Clang still receive -std=c++20 rather than -std=gnu++20.
target_compile_features(rearrangement PRIVATE cxx_std_20)
set_target_properties(rearrangement PROPERTIES CXX_EXTENSIONS OFF)

# -O3 is only meaningful for GCC-like drivers.
target_compile_options(rearrangement PRIVATE
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-O3>"
)

if(NAAPAM_STATIC_LINK)
  target_link_options(rearrangement PRIVATE -static)
endif()

# Both the compiled aligner and the gawk post-processing script are installed
# into the wheel's scripts directory.
install(TARGETS rearrangement RUNTIME DESTINATION "${SKBUILD_SCRIPTS_DIR}")
install(PROGRAMS src/correct_micro_homology.awk DESTINATION "${SKBUILD_SCRIPTS_DIR}")
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jingwei Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
naapam-0.1.12/PKG-INFO ADDED
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: naapam
3
+ Version: 0.1.12
4
+ Summary: Chip-based CRISPR analysis
5
+ Author-Email: ljw <ljw2017@sjtu.edu.cn>
6
+ License-Expression: MIT
7
+ License-File: LICENSE.md
8
+ Requires-Python: >=3.13
9
+ Requires-Dist: biopython>=1.86
10
+ Requires-Dist: pandas[feather]>=3.0.0
11
+ Requires-Dist: plotnine>=0.15.2
12
+ Requires-Dist: pysam>=0.23.3
13
+ Requires-Dist: pyyaml>=6.0.3
14
+ Requires-Dist: statsmodels>=0.14.6
15
+ Provides-Extra: dev
16
+ Requires-Dist: cibuildwheel>=3.3.1; extra == "dev"
17
+ Description-Content-Type: text/markdown
18
+
19
+ # TODO
20
+
21
+ - [ ] 仔细调整max_freq_temN
22
+ - [ ] 实在不行,保底拟合矫正del长度
23
+ - [ ] 利用可视化技术精确定位不好的reads
24
+ - [ ] mermaid diagram README for workflow
25
+
26
+ # Package
27
+
28
+ - [ ] format jupyter-notebook
29
+ - [ ] github action
30
+
31
+ # Dependencies
32
+
33
+ - bowtie2
34
+ - gawk
@@ -0,0 +1,16 @@
1
+ # TODO
2
+
3
+ - [ ] 仔细调整max_freq_temN
4
+ - [ ] 实在不行,保底拟合矫正del长度
5
+ - [ ] 利用可视化技术精确定位不好的reads
6
+ - [ ] mermaid diagram README for workflow
7
+
8
+ # Package
9
+
10
+ - [ ] format jupyter-notebook
11
+ - [ ] github action
12
+
13
+ # Dependencies
14
+
15
+ - bowtie2
16
+ - gawk
@@ -0,0 +1,54 @@
1
+ [project]
2
+ name = "naapam"
3
+ version = "0.1.12"
4
+ description = "Chip-based CRISPR analysis"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ license-files = ["LICENSE.md"]
8
+ authors = [
9
+ { name = "ljw", email = "ljw2017@sjtu.edu.cn" }
10
+ ]
11
+ requires-python = ">=3.13"
12
+ dependencies = [
13
+ "biopython>=1.86",
14
+ "pandas[feather]>=3.0.0",
15
+ "plotnine>=0.15.2",
16
+ "pysam>=0.23.3",
17
+ "pyyaml>=6.0.3",
18
+ "statsmodels>=0.14.6",
19
+ ]
20
+
21
+ [project.scripts]
22
+ naapam-parse = "naapam.parse:main"
23
+ naapam-parse-parallel = "naapam.scripts:parse"
24
+ naapam-align-parallel = "naapam.scripts:align"
25
+
26
+ [project.optional-dependencies]
27
+ dev = [
28
+ "cibuildwheel>=3.3.1",
29
+ ]
30
+
31
+ [tool.scikit-build]
32
+ cmake.build-type = "Release"
33
+ minimum-version = "build-system.requires"
34
+ build-dir = "build/{wheel_tag}"
35
+ sdist.exclude = [
36
+ ".github",
37
+ "exponential_to_straight",
38
+ ".gitignore",
39
+ "download_rearr.sh",
40
+ "uv.lock",
41
+ "release.sh"
42
+ ]
43
+ sdist.include = [
44
+ "/src/headers",
45
+ "/src/main.cpp",
46
+ "/src/correct_micro_homology.awk",
47
+ ]
48
+
49
+ [tool.uv]
50
+ cache-keys = [{ file = "pyproject.toml" }, { file = "src/**/*.{h,c,hpp,cpp}" }, { file = "CMakeLists.txt" }]
51
+
52
+ [build-system]
53
+ requires = ["scikit-build-core>=0.10"]
54
+ build-backend = "scikit_build_core.build"
@@ -0,0 +1,178 @@
1
+ #!/usr/bin/env -S gawk -f
2
+
3
+ # Usage: correct_micro_homology.awk -- reference_file direction_file < rearrangement_output
4
+
5
+ # reference_file is the same as the input of rearrangement. For each row of reference_file, direction_file has a row of fields, each being either up or down. Each field corresponds to a junction of two adjacent references in the row of reference_file. For up\down, correct_micro_homology.awk tries to remove the deletion or templated insertion of the up\down-stream DSB end. This is achieved by modifying the alignment up to the equivalence of microhomology.
6
+
7
+ # The header line of each alignment output by rearrangement only contains idx, #, score, id. correct_micro_homology.awk enriches the header by adding unaligned part of query and aligned ranges of both reference and query. The cut sites at junctions of adjacent references are also appended to the header line.
8
+
# Load the reference file and the direction file (the first two command-line
# arguments) before any alignment record is read from standard input.
#
# Each tab-separated row of refFile is walked in steps of three fields starting
# at field 3: field i is stored as the cut site on the reference whose sequence
# is field i - 1 (cut1s), and field i + 1 as the cut site on the following
# reference (cut2s). The *_accum arrays hold the same cut sites offset by the
# total length of the reference sequences preceding them in the row.
# The matching row of directionFile is split into direction_type[ref_id].
BEGIN {
    FS = "\t"
    refFile = ARGV[1]
    directionFile = ARGV[2]
    ref_id = 0
    # getline returns -1 on a read error (e.g. missing file), which is truthy;
    # compare against 0 explicitly so an unreadable refFile cannot loop forever.
    while ((read_status = (getline ref < refFile)) > 0) {
        # If directionFile has fewer rows than refFile, direction keeps the
        # value read for the previous row.
        getline direction < directionFile
        n = split(ref, ref_arr, "\t")
        split(direction, direction_type[ref_id], "\t")
        ref_accum_len = 0
        for (i = 3; i < n; i += 3) {
            cut1s[ref_id, i / 3] = ref_arr[i]
            cut1s_accum[ref_id, i / 3] = ref_accum_len + cut1s[ref_id, i / 3]
            ref_accum_len += length(ref_arr[i - 1])
            cut2s[ref_id, i / 3] = ref_arr[i + 1]
            cut2s_accum[ref_id, i / 3] = ref_accum_len + cut2s[ref_id, i / 3]
        }
        ++ref_id
    }
    if (read_status < 0) {
        printf("correct_micro_homology.awk: cannot read reference file '%s'\n", refFile) > "/dev/stderr"
        exit 1
    }
    close(refFile)
    close(directionFile)
    # Delete refFile and directionFile from the arguments, so that awk does not
    # process them as input files.
    for (i = 1; i <= 2; ++i) {
        delete ARGV[i]
    }
}
34
+
# Cut queryline into pieces that mirror the layout of the reference line:
# targets[k] receives the stretch of queryline lying under refs[k] (same
# length as refs[k]), and inserts[k] the stretch lying under the separator
# dashs[k] (same length as dashs[k]). inserts[0] is the piece before the first
# reference and inserts[length(refs)] the piece after the last one.
# cursor, k and ref_count are local variables.
function query_patsplit(refs, dashs, queryline, targets, inserts, cursor, k, ref_count) {
    ref_count = length(refs)
    cursor = 1
    k = 0
    while (k < ref_count) {
        # Separator piece preceding reference k + 1.
        inserts[k] = substr(queryline, cursor, length(dashs[k]))
        cursor += length(dashs[k])
        ++k
        # Piece aligned under reference k.
        targets[k] = substr(queryline, cursor, length(refs[k]))
        cursor += length(refs[k])
    }
    inserts[ref_count] = substr(queryline, cursor, length(dashs[ref_count]))
}
46
+
# Locate the aligned stretch of target: the span running from its first to its
# last upper-case base (A, C, G, T or N), dashes in between included.
# seg_range[1] is the 0-based start offset of that span and seg_range[2] its
# end offset (exclusive). When target holds no such base, both are -1.
# hit and first are local variables.
function query_seg_range(target, seg_range, hit, first) {
    first = match(target, /([ACGTN][-ACGTN]*[ACGTN]|[ACGTN])/, hit)
    seg_range[1] = first - 1
    seg_range[2] = seg_range[1] + length(hit[0])
}
52
+
# Since ref contains gaps ('-'), the actual cut site in ref is pushed
# downstream by the gaps upstream of it.
# ref is split into runs of bases and the gaps between them; the returned value
# is cut plus the total length of the gaps preceding the first run at which the
# accumulated base count reaches cut.
# n, segs, gaps, accum_len, j, k and gap_cut are local variables (gap_cut used
# to be missing from this list and therefore leaked into the global namespace).
function get_gap_cut(ref, cut, n, segs, gaps, accum_len, j, k, gap_cut) {
    n = patsplit(ref, segs, /[acgtnACGTN]+/, gaps)
    accum_len = 0
    for (j = 1; j <= n; ++j) {
        accum_len += length(segs[j])
        if (accum_len >= cut) {
            break
        }
    }
    gap_cut = cut
    # gaps[k] is the separator following segs[k]; only gaps before run j count.
    for (k = 1; k < j; ++k) {
        gap_cut += length(gaps[k])
    }
    return gap_cut
}
69
+
# Get the length of the longest common prefix\suffix of string1 and string2,
# compared case-insensitively. fix selects the mode: "prefix" compares from the
# start, any other value (callers pass "suffix") compares from the end.
# The result is at most min(length(string1), length(string2)); in particular it
# is length(string2) when all of string2 matches. (Previously that case fell
# off the end of the function without a return statement, so the caller saw an
# empty value, i.e. 0.)
# i, limit, len1 and len2 are local variables.
function longest_common_fix(string1, string2, fix, i, limit, len1, len2) {
    string1 = toupper(string1)
    string2 = toupper(string2)
    len1 = length(string1)
    len2 = length(string2)
    limit = len1 < len2 ? len1 : len2
    # Plain character comparison; no dynamic regex has to be built.
    for (i = 1; i <= limit; ++i) {
        if (fix == "prefix") {
            if (substr(string1, i, 1) != substr(string2, i, 1)) {
                break
            }
        } else if (substr(string1, len1 - i + 1, 1) != substr(string2, len2 - i + 1, 1)) {
            break
        }
    }
    return i - 1
}
82
+
# Append the header fields for one reference to the current output line:
# the query piece preceding it (insert), then the start positions and the end
# positions of its aligned stretch as "ref_pos<TAB>query_pos" pairs.
# The globals ref_pos and query_pos are running offsets that this function
# advances; on return ref_pos has moved past the whole of ref.
# seg_range is the pair produced by query_seg_range for target; a start of -1
# marks a target without any aligned base.
# lead, span, ref_block and query_block are local variables.
function print_mark(insert, seg_range, ref, target, lead, span, ref_block, query_block) {
    printf("%s\t", insert)

    # Start positions: the query advances over insert, the reference over the
    # part of ref in front of the aligned stretch.
    query_pos += length(insert)
    lead = seg_range[1]
    if (lead != -1) {
        ref_pos += lead
    }
    printf("%d\t%d\t", ref_pos, query_pos)

    # End positions: each side advances by its number of non-gap characters
    # within the aligned stretch.
    span = seg_range[2] - seg_range[1]
    ref_block = substr(ref, seg_range[1] + 1, span)
    query_block = substr(target, seg_range[1] + 1, span)
    gsub(/-/, "", ref_block)
    gsub(/-/, "", query_block)
    ref_pos += length(ref_block)
    query_pos += length(query_block)
    printf("%d\t%d\t", ref_pos, query_pos)

    # Skip what is left of ref behind the aligned stretch (all of ref when
    # nothing was aligned).
    if (seg_range[1] == -1) {
        ref_pos += length(ref)
    } else {
        ref_pos += length(ref) - seg_range[2]
    }
}
98
+
# Main rule: runs on every header line of the rearrangement output and consumes
# the two lines that follow it (reference line and query line) with getline.
# It prints the enriched header, then the reference line unchanged, then the
# query line after micro-homology correction.
{
    idx = $1
    count = $2
    score = $3
    ref_id = $4
    printf("%d\t%d\t%d\t%d\t", idx, count, score, ref_id)

    getline refline
    getline queryline

    # refs[k] is the k-th reference within refline (delimited by lower-case
    # bases at both ends), dashs[k] the separator after it.
    patsplit(refline, refs, /[acgtn][-ACGTN]*[acgtn]/, dashs)
    query_patsplit(refs, dashs, queryline, targets, inserts)

    # Iteration over all reference junctions.
    ref_pos = query_pos = 0
    query_seg_range(targets[1], seg_range1)
    for (i = 1; i < length(refs); ++i) {
        query_seg_range(targets[i + 1], seg_range2)
        # Micro-homology correction is possible only if there is no unaligned
        # part of query between the alignments of the two adjacent references,
        # and neither reference is skipped. query_seg_range marks a skipped
        # reference with -1 (not 0; 0 is an alignment starting at the first
        # position of the reference), so -1 is what must be tested here.
        if (length(inserts[i]) == 0 && seg_range1[1] != -1 && seg_range2[1] != -1) {
            gap_cut1 = get_gap_cut(refs[i], cut1s[ref_id, i])
            gap_cut2 = get_gap_cut(refs[i + 1], cut2s[ref_id, i])

            # Determine the correct direction based on both direction_type in
            # directionFile and the actual indel type.
            correct_direct = ""
            if (tolower(direction_type[ref_id][i]) == "up") {
                if (seg_range1[2] < gap_cut1) {
                    # Upstream alignment ends before its cut site.
                    receiver = substr(refs[i], seg_range1[2] + 1, gap_cut1 - seg_range1[2])
                    provider = substr(refs[i + 1], seg_range2[1] + 1, seg_range2[2] - seg_range2[1])
                    correct_direct = "prefix"
                } else if (seg_range1[2] > gap_cut1) {
                    # Upstream alignment runs past its cut site.
                    provider = substr(refs[i], gap_cut1 + 1, seg_range1[2] - gap_cut1)
                    receiver = substr(refs[i + 1], 1, seg_range2[1])
                    correct_direct = "suffix"
                }
            } else {
                if (seg_range2[1] < gap_cut2) {
                    # Downstream alignment starts before its cut site.
                    provider = substr(refs[i + 1], seg_range2[1] + 1, gap_cut2 - seg_range2[1])
                    receiver = substr(refs[i], seg_range1[2] + 1)
                    correct_direct = "prefix"
                } else if (seg_range2[1] > gap_cut2) {
                    # Downstream alignment starts after its cut site.
                    receiver = substr(refs[i + 1], gap_cut2 + 1, seg_range2[1] - gap_cut2)
                    provider = substr(refs[i], seg_range1[1] + 1, seg_range1[2] - seg_range1[1])
                    correct_direct = "suffix"
                }
            }

            # If there is neither deletion nor templated insertion, then
            # neglect the correction step.
            if (correct_direct != "") {
                correct_length = longest_common_fix(receiver, provider, correct_direct)
                target1_split_start = correct_direct == "prefix" ? seg_range1[2] : seg_range1[2] - correct_length
                target2_split_start = correct_direct == "prefix" ? seg_range2[1] : seg_range2[1] - correct_length
                # Swap correct_length characters between the two targets at the
                # junction, which moves the junction by that many positions.
                target1_exchange = substr(targets[i], target1_split_start + 1, correct_length)
                target2_exchange = substr(targets[i + 1], target2_split_start + 1, correct_length)
                targets[i] = substr(targets[i], 1, target1_split_start) target2_exchange substr(targets[i], target1_split_start + correct_length + 1)
                targets[i + 1] = substr(targets[i + 1], 1, target2_split_start) target1_exchange substr(targets[i + 1], target2_split_start + correct_length + 1)

                shift = correct_direct == "prefix" ? correct_length : -correct_length
                seg_range1[2] += shift
                seg_range2[1] += shift
            }
        }
        # Print additional header information upstream to the current junction.
        print_mark(inserts[i - 1], seg_range1, refs[i], targets[i])
        seg_range1[1] = seg_range2[1]
        seg_range1[2] = seg_range2[2]
    }
    # Print additional header information downstream to the last junction.
    print_mark(inserts[i - 1], seg_range1, refs[i], targets[i])
    printf("%s", inserts[i])
    # Append the accumulated cut sites of every junction.
    for (i = 1; i < length(refs); ++i) {
        printf("\t%d\t%d", cut1s_accum[ref_id, i], cut2s_accum[ref_id, i])
    }
    # Construct corrected queryline.
    queryline = inserts[0]
    for (i = 1; i <= length(refs); ++i) {
        queryline = queryline targets[i] inserts[i]
    }
    printf("\n%s\n%s\n", refline, queryline)
}